From 634f0348fe336fce8f6cab1933139115e983ed2f Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Tue, 26 Nov 2019 11:30:27 -0800 Subject: [PATCH 001/118] gpio: xtensa: fix driver build Commit cad6fade6e78 ("xtensa: clean up WSR*/RSR*/get_sr/set_sr") removed {RSR,WSR}_CPENABLE from xtensa code, but did not fix up all users, breaking gpio-xtensa driver build. Update gpio-xtensa to use new xtensa_{get,set}_sr API. Cc: stable@vger.kernel.org # v5.0+ Fixes: cad6fade6e78 ("xtensa: clean up WSR*/RSR*/get_sr/set_sr") Signed-off-by: Max Filippov Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-xtensa.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/gpio/gpio-xtensa.c b/drivers/gpio/gpio-xtensa.c index 08d7c3b32038..c8af34a6368f 100644 --- a/drivers/gpio/gpio-xtensa.c +++ b/drivers/gpio/gpio-xtensa.c @@ -44,15 +44,14 @@ static inline unsigned long enable_cp(unsigned long *cpenable) unsigned long flags; local_irq_save(flags); - RSR_CPENABLE(*cpenable); - WSR_CPENABLE(*cpenable | BIT(XCHAL_CP_ID_XTIOP)); - + *cpenable = xtensa_get_sr(cpenable); + xtensa_set_sr(*cpenable | BIT(XCHAL_CP_ID_XTIOP), cpenable); return flags; } static inline void disable_cp(unsigned long flags, unsigned long cpenable) { - WSR_CPENABLE(cpenable); + xtensa_set_sr(cpenable, cpenable); local_irq_restore(flags); } From 322f6a3182d42df18059a89c53b09d33919f755e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johnson=20CH=20Chen=20=28=E9=99=B3=E6=98=AD=E5=8B=B3=29?= Date: Tue, 26 Nov 2019 06:51:11 +0000 Subject: [PATCH 002/118] gpio: mpc8xxx: Add platform device to gpiochip->parent Dear Linus Walleij, In old kernels, some APIs still try to use parent->of_node from struct gpio_chip, and it could be resulted in kernel panic because parent is NULL. Adding platform device to gpiochip->parent can fix this problem. Signed-off-by: Johnson Chen Link: https://patchwork.kernel.org/patch/11234609 Link: https://lore.kernel.org/r/HK0PR01MB3521489269F76467DFD7843FFA450@HK0PR01MB3521.apcprd01.prod.exchangelabs.com Signed-off-by: Linus Walleij --- drivers/gpio/gpio-mpc8xxx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpio/gpio-mpc8xxx.c b/drivers/gpio/gpio-mpc8xxx.c index f1e164cecff8..5ae30de3490a 100644 --- a/drivers/gpio/gpio-mpc8xxx.c +++ b/drivers/gpio/gpio-mpc8xxx.c @@ -346,6 +346,7 @@ static int mpc8xxx_probe(struct platform_device *pdev) return -ENOMEM; gc = &mpc8xxx_gc->gc; + gc->parent = &pdev->dev; if (of_property_read_bool(np, "little-endian")) { ret = bgpio_init(gc, &pdev->dev, 4, From d935bd50dd14a7714cbdba9a76435dbb56edb1ae Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 27 Nov 2019 10:59:19 +0100 Subject: [PATCH 003/118] gpio: Fix error message on out-of-range GPIO in lookup table When a GPIO offset in a lookup table is out-of-range, the printed error message (1) does not include the actual out-of-range value, and (2) contains an off-by-one error in the upper bound. Avoid user confusion by also printing the actual GPIO offset, and correcting the upper bound of the range. While at it, use "%u" for unsigned int. Sample impact: -requested GPIO 0 is out of range [0..32] for chip e6052000.gpio +requested GPIO 0 (45) is out of range [0..31] for chip e6052000.gpio Fixes: 2a3cf6a3599e9015 ("gpiolib: return -ENOENT if no GPIO mapping exists") Signed-off-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20191127095919.4214-1-geert+renesas@glider.be Signed-off-by: Linus Walleij --- drivers/gpio/gpiolib.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 9913886ede90..dce0b31f4125 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -4472,8 +4472,9 @@ static struct gpio_desc *gpiod_find(struct device *dev, const char *con_id, if (chip->ngpio <= p->chip_hwnum) { dev_err(dev, - "requested GPIO %d is out of range [0..%d] for chip %s\n", - idx, chip->ngpio, chip->label); + "requested GPIO %u (%u) is out of range [0..%u] for chip %s\n", + idx, p->chip_hwnum, chip->ngpio - 1, + chip->label); return ERR_PTR(-EINVAL); } From bc7bc688366a4f2544699e40cce45393b7f24408 Mon Sep 17 00:00:00 2001 From: Kent Gibson Date: Wed, 11 Dec 2019 08:46:31 +0800 Subject: [PATCH 004/118] gpio: mockup: Fix usage of new GPIO_LINE_DIRECTION Restore the external behavior of gpio-mockup to what it was prior to the change to using GPIO_LINE_DIRECTION. Fixes: e42615ec233b ("gpio: Use new GPIO_LINE_DIRECTION") Signed-off-by: Kent Gibson Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-mockup.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/gpio/gpio-mockup.c b/drivers/gpio/gpio-mockup.c index 56d647a30e3e..c4fdc192ea4e 100644 --- a/drivers/gpio/gpio-mockup.c +++ b/drivers/gpio/gpio-mockup.c @@ -226,7 +226,7 @@ static int gpio_mockup_get_direction(struct gpio_chip *gc, unsigned int offset) int direction; mutex_lock(&chip->lock); - direction = !chip->lines[offset].dir; + direction = chip->lines[offset].dir; mutex_unlock(&chip->lock); return direction; @@ -395,7 +395,7 @@ static int gpio_mockup_probe(struct platform_device *pdev) struct gpio_chip *gc; struct device *dev; const char *name; - int rv, base; + int rv, base, i; u16 ngpio; dev = &pdev->dev; @@ -447,6 +447,9 @@ static int gpio_mockup_probe(struct platform_device *pdev) if (!chip->lines) return -ENOMEM; + for (i = 0; i < gc->ngpio; i++) + chip->lines[i].dir = GPIO_LINE_DIRECTION_IN; + if (device_property_read_bool(dev, "named-gpio-lines")) { rv = gpio_mockup_name_lines(dev, chip); if (rv) From 11e299de3aced4ea23a9fb1fef6c983c8d516302 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 10 Dec 2019 21:28:31 +0100 Subject: [PATCH 005/118] gpio: aspeed: avoid return type warning gcc has a hard time tracking whether BUG_ON(1) ends execution or not: drivers/gpio/gpio-aspeed-sgpio.c: In function 'bank_reg': drivers/gpio/gpio-aspeed-sgpio.c:112:1: error: control reaches end of non-void function [-Werror=return-type] Use the simpler BUG() that gcc knows cannot continue. Fixes: f8b410e3695a ("gpio: aspeed-sgpio: Rename and add Kconfig/Makefile") Signed-off-by: Arnd Bergmann Acked-by: Andrew Jeffery Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-aspeed-sgpio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-aspeed-sgpio.c b/drivers/gpio/gpio-aspeed-sgpio.c index 7e99860ca447..8319812593e3 100644 --- a/drivers/gpio/gpio-aspeed-sgpio.c +++ b/drivers/gpio/gpio-aspeed-sgpio.c @@ -107,7 +107,7 @@ static void __iomem *bank_reg(struct aspeed_sgpio *gpio, return gpio->base + bank->irq_regs + GPIO_IRQ_STATUS; default: /* acturally if code runs to here, it's an error case */ - BUG_ON(1); + BUG(); } } From 9cc7a18bcbd47e82960a059127497ab09c4abd05 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 10 Dec 2019 20:54:01 +0100 Subject: [PATCH 006/118] gpio: xgs-iproc: remove __exit annotation for iproc_gpio_remove When built into the kernel, the driver causes a link problem: `iproc_gpio_remove' referenced in section `.data' of drivers/gpio/gpio-xgs-iproc.o: defined in discarded section `.exit.text' of drivers/gpio/gpio-xgs-iproc.o Remove the incorrect annotation. Fixes: 6a41b6c5fc20 ("gpio: Add xgs-iproc driver") Signed-off-by: Arnd Bergmann Reviewed-by: Chris Packham Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-xgs-iproc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-xgs-iproc.c b/drivers/gpio/gpio-xgs-iproc.c index 773e5c24309e..b21c2e436b61 100644 --- a/drivers/gpio/gpio-xgs-iproc.c +++ b/drivers/gpio/gpio-xgs-iproc.c @@ -280,7 +280,7 @@ static int iproc_gpio_probe(struct platform_device *pdev) return 0; } -static int __exit iproc_gpio_remove(struct platform_device *pdev) +static int iproc_gpio_remove(struct platform_device *pdev) { struct iproc_gpio_chip *chip; From 44fe5cb32c7599a4059931a98794e0418619aa96 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 4 Dec 2019 14:31:35 +0200 Subject: [PATCH 007/118] MAINTAINERS: Append missed file to the database When gpiolib.h internal header had been split to few, the commit 77cb907abe6c ("gpiolib: acpi: Split ACPI stuff to gpiolib-acpi.h") in particular missed the MAINTAINERS database update. Do it here. Fixes: 77cb907abe6c ("gpiolib: acpi: Split ACPI stuff to gpiolib-acpi.h") Signed-off-by: Andy Shevchenko Acked-by: Mika Westerberg Signed-off-by: Bartosz Golaszewski --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index bd5847e802de..302757d8b54f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7031,6 +7031,7 @@ L: linux-acpi@vger.kernel.org S: Maintained F: Documentation/firmware-guide/acpi/gpio-properties.rst F: drivers/gpio/gpiolib-acpi.c +F: drivers/gpio/gpiolib-acpi.h GPIO IR Transmitter M: Sean Young From 256efaea1fdc4e38970489197409a26125ee0aaa Mon Sep 17 00:00:00 2001 From: Russell King Date: Sat, 7 Dec 2019 16:20:18 +0000 Subject: [PATCH 008/118] gpiolib: fix up emulated open drain outputs gpiolib has a corner case with open drain outputs that are emulated. When such outputs are outputting a logic 1, emulation will set the hardware to input mode, which will cause gpiod_get_direction() to report that it is in input mode. This is different from the behaviour with a true open-drain output. Unify the semantics here. Cc: Suggested-by: Linus Walleij Signed-off-by: Russell King Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 9913886ede90..8a08a9d4f071 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -220,6 +220,14 @@ int gpiod_get_direction(struct gpio_desc *desc) chip = gpiod_to_chip(desc); offset = gpio_chip_hwgpio(desc); + /* + * Open drain emulation using input mode may incorrectly report + * input here, fix that up. + */ + if (test_bit(FLAG_OPEN_DRAIN, &desc->flags) && + test_bit(FLAG_IS_OUT, &desc->flags)) + return 0; + if (!chip->get_direction) return -ENOTSUPP; From 2688302bf1924ebccab2a3b298dd50b4f744d48d Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 9 Dec 2019 14:35:44 +0200 Subject: [PATCH 009/118] gpio: pca953x: Switch to bitops in IRQ callbacks Since we have driver converted to use bitmap API we must use traditional bit operations (set_bit(), clear_bit(), etc.) against it. Currently IRQ callbacks are missed in the conversion and thus broken. Let's fix it right here right now. Fixes: 35d13d94893f ("gpio: pca953x: convert to use bitmap API") Signed-off-by: Andy Shevchenko Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-pca953x.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/drivers/gpio/gpio-pca953x.c b/drivers/gpio/gpio-pca953x.c index 6652bee01966..9853547e7276 100644 --- a/drivers/gpio/gpio-pca953x.c +++ b/drivers/gpio/gpio-pca953x.c @@ -568,16 +568,18 @@ static void pca953x_irq_mask(struct irq_data *d) { struct gpio_chip *gc = irq_data_get_irq_chip_data(d); struct pca953x_chip *chip = gpiochip_get_data(gc); + irq_hw_number_t hwirq = irqd_to_hwirq(d); - chip->irq_mask[d->hwirq / BANK_SZ] &= ~BIT(d->hwirq % BANK_SZ); + clear_bit(hwirq, chip->irq_mask); } static void pca953x_irq_unmask(struct irq_data *d) { struct gpio_chip *gc = irq_data_get_irq_chip_data(d); struct pca953x_chip *chip = gpiochip_get_data(gc); + irq_hw_number_t hwirq = irqd_to_hwirq(d); - chip->irq_mask[d->hwirq / BANK_SZ] |= BIT(d->hwirq % BANK_SZ); + set_bit(hwirq, chip->irq_mask); } static int pca953x_irq_set_wake(struct irq_data *d, unsigned int on) @@ -635,8 +637,7 @@ static int pca953x_irq_set_type(struct irq_data *d, unsigned int type) { struct gpio_chip *gc = irq_data_get_irq_chip_data(d); struct pca953x_chip *chip = gpiochip_get_data(gc); - int bank_nb = d->hwirq / BANK_SZ; - u8 mask = BIT(d->hwirq % BANK_SZ); + irq_hw_number_t hwirq = irqd_to_hwirq(d); if (!(type & IRQ_TYPE_EDGE_BOTH)) { dev_err(&chip->client->dev, "irq %d: unsupported type %d\n", @@ -644,15 +645,8 @@ static int pca953x_irq_set_type(struct irq_data *d, unsigned int type) return -EINVAL; } - if (type & IRQ_TYPE_EDGE_FALLING) - chip->irq_trig_fall[bank_nb] |= mask; - else - chip->irq_trig_fall[bank_nb] &= ~mask; - - if (type & IRQ_TYPE_EDGE_RISING) - chip->irq_trig_raise[bank_nb] |= mask; - else - chip->irq_trig_raise[bank_nb] &= ~mask; + assign_bit(hwirq, chip->irq_trig_fall, type & IRQ_TYPE_EDGE_FALLING); + assign_bit(hwirq, chip->irq_trig_raise, type & IRQ_TYPE_EDGE_RISING); return 0; } @@ -661,10 +655,10 @@ static void pca953x_irq_shutdown(struct irq_data *d) { struct gpio_chip *gc = irq_data_get_irq_chip_data(d); struct pca953x_chip *chip = gpiochip_get_data(gc); - u8 mask = BIT(d->hwirq % BANK_SZ); + irq_hw_number_t hwirq = irqd_to_hwirq(d); - chip->irq_trig_raise[d->hwirq / BANK_SZ] &= ~mask; - chip->irq_trig_fall[d->hwirq / BANK_SZ] &= ~mask; + clear_bit(hwirq, chip->irq_trig_raise); + clear_bit(hwirq, chip->irq_trig_fall); } static bool pca953x_irq_pending(struct pca953x_chip *chip, unsigned long *pending) From 6f7c41374b62fd80bbd8aae3536c43688c54d95e Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Mon, 25 Nov 2019 10:46:51 +0900 Subject: [PATCH 010/118] tomoyo: Don't use nifty names on sockets. syzbot is reporting that use of SOCKET_I()->sk from open() can result in use after free problem [1], for socket's inode is still reachable via /proc/pid/fd/n despite destruction of SOCKET_I()->sk already completed. At first I thought that this race condition applies to only open/getattr permission checks. But James Morris has pointed out that there are more permission checks where this race condition applies to. Thus, get rid of tomoyo_get_socket_name() instead of conditionally bypassing permission checks on sockets. As a side effect of this patch, "socket:[family=\$:type=\$:protocol=\$]" in the policy files has to be rewritten to "socket:[\$]". [1] https://syzkaller.appspot.com/bug?id=73d590010454403d55164cca23bd0565b1eb3b74 Signed-off-by: Tetsuo Handa Reported-by: syzbot Reported-by: James Morris --- security/tomoyo/realpath.c | 32 +------------------------------- 1 file changed, 1 insertion(+), 31 deletions(-) diff --git a/security/tomoyo/realpath.c b/security/tomoyo/realpath.c index e7832448d721..bf38fc1b59b2 100644 --- a/security/tomoyo/realpath.c +++ b/security/tomoyo/realpath.c @@ -217,31 +217,6 @@ static char *tomoyo_get_local_path(struct dentry *dentry, char * const buffer, return ERR_PTR(-ENOMEM); } -/** - * tomoyo_get_socket_name - Get the name of a socket. - * - * @path: Pointer to "struct path". - * @buffer: Pointer to buffer to return value in. - * @buflen: Sizeof @buffer. - * - * Returns the buffer. - */ -static char *tomoyo_get_socket_name(const struct path *path, char * const buffer, - const int buflen) -{ - struct inode *inode = d_backing_inode(path->dentry); - struct socket *sock = inode ? SOCKET_I(inode) : NULL; - struct sock *sk = sock ? sock->sk : NULL; - - if (sk) { - snprintf(buffer, buflen, "socket:[family=%u:type=%u:protocol=%u]", - sk->sk_family, sk->sk_type, sk->sk_protocol); - } else { - snprintf(buffer, buflen, "socket:[unknown]"); - } - return buffer; -} - /** * tomoyo_realpath_from_path - Returns realpath(3) of the given pathname but ignores chroot'ed root. * @@ -279,12 +254,7 @@ char *tomoyo_realpath_from_path(const struct path *path) break; /* To make sure that pos is '\0' terminated. */ buf[buf_len - 1] = '\0'; - /* Get better name for socket. */ - if (sb->s_magic == SOCKFS_MAGIC) { - pos = tomoyo_get_socket_name(path, buf, buf_len - 1); - goto encode; - } - /* For "pipe:[\$]". */ + /* For "pipe:[\$]" and "socket:[\$]". */ if (dentry->d_op && dentry->d_op->d_dname) { pos = dentry->d_op->d_dname(dentry, buf, buf_len - 1); goto encode; From 286e7beaa4cc7734894ee214569de4669ed9891e Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Fri, 13 Dec 2019 14:00:34 +0100 Subject: [PATCH 011/118] gpio: tegra186: Allow building on Tegra194-only configurations The driver is compatible with both Tegra186 and Tegra194, but currently it cannot be selected if only Tegra194 support is enabled. Allow builds with only Tegra194 support enabled to select this driver. While at it, select this driver by default on Tegra194 builds because it is an essential part of the system. Signed-off-by: Thierry Reding Link: https://lore.kernel.org/r/20191213130034.219227-1-thierry.reding@gmail.com Signed-off-by: Linus Walleij --- drivers/gpio/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index 8adffd42f8cb..6ab25fe1c423 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -553,8 +553,8 @@ config GPIO_TEGRA config GPIO_TEGRA186 tristate "NVIDIA Tegra186 GPIO support" - default ARCH_TEGRA_186_SOC - depends on ARCH_TEGRA_186_SOC || COMPILE_TEST + default ARCH_TEGRA_186_SOC || ARCH_TEGRA_194_SOC + depends on ARCH_TEGRA_186_SOC || ARCH_TEGRA_194_SOC || COMPILE_TEST depends on OF_GPIO select GPIOLIB_IRQCHIP select IRQ_DOMAIN_HIERARCHY From 6bd5ce6089b561f5392460bfb654dea89356ab1b Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Mon, 16 Dec 2019 19:16:48 +0900 Subject: [PATCH 012/118] tomoyo: Suppress RCU warning at list_for_each_entry_rcu(). John Garry has reported that allmodconfig kernel on arm64 causes flood of "RCU-list traversed in non-reader section!!" warning. I don't know what change caused this warning, but this warning is safe because TOMOYO uses SRCU lock instead. Let's suppress this warning by explicitly telling that the caller is holding SRCU lock. Reported-and-tested-by: John Garry Signed-off-by: Tetsuo Handa --- security/tomoyo/common.c | 9 ++++++--- security/tomoyo/domain.c | 15 ++++++++++----- security/tomoyo/group.c | 9 ++++++--- security/tomoyo/util.c | 6 ++++-- 4 files changed, 26 insertions(+), 13 deletions(-) diff --git a/security/tomoyo/common.c b/security/tomoyo/common.c index dd3d5942e669..c36bafbcd77e 100644 --- a/security/tomoyo/common.c +++ b/security/tomoyo/common.c @@ -951,7 +951,8 @@ static bool tomoyo_manager(void) exe = tomoyo_get_exe(); if (!exe) return false; - list_for_each_entry_rcu(ptr, &tomoyo_kernel_namespace.policy_list[TOMOYO_ID_MANAGER], head.list) { + list_for_each_entry_rcu(ptr, &tomoyo_kernel_namespace.policy_list[TOMOYO_ID_MANAGER], head.list, + srcu_read_lock_held(&tomoyo_ss)) { if (!ptr->head.is_deleted && (!tomoyo_pathcmp(domainname, ptr->manager) || !strcmp(exe, ptr->manager->name))) { @@ -1095,7 +1096,8 @@ static int tomoyo_delete_domain(char *domainname) if (mutex_lock_interruptible(&tomoyo_policy_lock)) return -EINTR; /* Is there an active domain? */ - list_for_each_entry_rcu(domain, &tomoyo_domain_list, list) { + list_for_each_entry_rcu(domain, &tomoyo_domain_list, list, + srcu_read_lock_held(&tomoyo_ss)) { /* Never delete tomoyo_kernel_domain */ if (domain == &tomoyo_kernel_domain) continue; @@ -2778,7 +2780,8 @@ void tomoyo_check_profile(void) tomoyo_policy_loaded = true; pr_info("TOMOYO: 2.6.0\n"); - list_for_each_entry_rcu(domain, &tomoyo_domain_list, list) { + list_for_each_entry_rcu(domain, &tomoyo_domain_list, list, + srcu_read_lock_held(&tomoyo_ss)) { const u8 profile = domain->profile; struct tomoyo_policy_namespace *ns = domain->ns; diff --git a/security/tomoyo/domain.c b/security/tomoyo/domain.c index 8526a0a74023..7869d6a9980b 100644 --- a/security/tomoyo/domain.c +++ b/security/tomoyo/domain.c @@ -41,7 +41,8 @@ int tomoyo_update_policy(struct tomoyo_acl_head *new_entry, const int size, if (mutex_lock_interruptible(&tomoyo_policy_lock)) return -ENOMEM; - list_for_each_entry_rcu(entry, list, list) { + list_for_each_entry_rcu(entry, list, list, + srcu_read_lock_held(&tomoyo_ss)) { if (entry->is_deleted == TOMOYO_GC_IN_PROGRESS) continue; if (!check_duplicate(entry, new_entry)) @@ -119,7 +120,8 @@ int tomoyo_update_domain(struct tomoyo_acl_info *new_entry, const int size, } if (mutex_lock_interruptible(&tomoyo_policy_lock)) goto out; - list_for_each_entry_rcu(entry, list, list) { + list_for_each_entry_rcu(entry, list, list, + srcu_read_lock_held(&tomoyo_ss)) { if (entry->is_deleted == TOMOYO_GC_IN_PROGRESS) continue; if (!tomoyo_same_acl_head(entry, new_entry) || @@ -166,7 +168,8 @@ void tomoyo_check_acl(struct tomoyo_request_info *r, u16 i = 0; retry: - list_for_each_entry_rcu(ptr, list, list) { + list_for_each_entry_rcu(ptr, list, list, + srcu_read_lock_held(&tomoyo_ss)) { if (ptr->is_deleted || ptr->type != r->param_type) continue; if (!check_entry(r, ptr)) @@ -298,7 +301,8 @@ static inline bool tomoyo_scan_transition { const struct tomoyo_transition_control *ptr; - list_for_each_entry_rcu(ptr, list, head.list) { + list_for_each_entry_rcu(ptr, list, head.list, + srcu_read_lock_held(&tomoyo_ss)) { if (ptr->head.is_deleted || ptr->type != type) continue; if (ptr->domainname) { @@ -735,7 +739,8 @@ int tomoyo_find_next_domain(struct linux_binprm *bprm) /* Check 'aggregator' directive. */ candidate = &exename; - list_for_each_entry_rcu(ptr, list, head.list) { + list_for_each_entry_rcu(ptr, list, head.list, + srcu_read_lock_held(&tomoyo_ss)) { if (ptr->head.is_deleted || !tomoyo_path_matches_pattern(&exename, ptr->original_name)) diff --git a/security/tomoyo/group.c b/security/tomoyo/group.c index a37c7dc66e44..1cecdd797597 100644 --- a/security/tomoyo/group.c +++ b/security/tomoyo/group.c @@ -133,7 +133,8 @@ tomoyo_path_matches_group(const struct tomoyo_path_info *pathname, { struct tomoyo_path_group *member; - list_for_each_entry_rcu(member, &group->member_list, head.list) { + list_for_each_entry_rcu(member, &group->member_list, head.list, + srcu_read_lock_held(&tomoyo_ss)) { if (member->head.is_deleted) continue; if (!tomoyo_path_matches_pattern(pathname, member->member_name)) @@ -161,7 +162,8 @@ bool tomoyo_number_matches_group(const unsigned long min, struct tomoyo_number_group *member; bool matched = false; - list_for_each_entry_rcu(member, &group->member_list, head.list) { + list_for_each_entry_rcu(member, &group->member_list, head.list, + srcu_read_lock_held(&tomoyo_ss)) { if (member->head.is_deleted) continue; if (min > member->number.values[1] || @@ -191,7 +193,8 @@ bool tomoyo_address_matches_group(const bool is_ipv6, const __be32 *address, bool matched = false; const u8 size = is_ipv6 ? 16 : 4; - list_for_each_entry_rcu(member, &group->member_list, head.list) { + list_for_each_entry_rcu(member, &group->member_list, head.list, + srcu_read_lock_held(&tomoyo_ss)) { if (member->head.is_deleted) continue; if (member->address.is_ipv6 != is_ipv6) diff --git a/security/tomoyo/util.c b/security/tomoyo/util.c index 52752e1a84ed..eba0b3395851 100644 --- a/security/tomoyo/util.c +++ b/security/tomoyo/util.c @@ -594,7 +594,8 @@ struct tomoyo_domain_info *tomoyo_find_domain(const char *domainname) name.name = domainname; tomoyo_fill_path_info(&name); - list_for_each_entry_rcu(domain, &tomoyo_domain_list, list) { + list_for_each_entry_rcu(domain, &tomoyo_domain_list, list, + srcu_read_lock_held(&tomoyo_ss)) { if (!domain->is_deleted && !tomoyo_pathcmp(&name, domain->domainname)) return domain; @@ -1028,7 +1029,8 @@ bool tomoyo_domain_quota_is_ok(struct tomoyo_request_info *r) return false; if (!domain) return true; - list_for_each_entry_rcu(ptr, &domain->acl_info_list, list) { + list_for_each_entry_rcu(ptr, &domain->acl_info_list, list, + srcu_read_lock_held(&tomoyo_ss)) { u16 perm; u8 i; From e0ff126ee7ad405c1ef531f9f3db92929de4f20f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 16 Dec 2019 11:54:32 +0100 Subject: [PATCH 013/118] pipe: Fix bogus dereference in iov_iter_alignment() We cannot look at 'i->pipe' unless we know the iter is a pipe. Move the ring_size load to a branch in iov_iter_alignment() where we've already checked the iter is a pipe to avoid bogus dereference. Reported-by: syzbot+bea68382bae9490e7dd6@syzkaller.appspotmail.com Fixes: 8cefc107ca54 ("pipe: Use head and tail pointers for the ring, not cursor and length") Signed-off-by: Jan Kara Signed-off-by: Al Viro --- lib/iov_iter.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/iov_iter.c b/lib/iov_iter.c index fb29c02c6a3c..51595bf3af85 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1222,11 +1222,12 @@ EXPORT_SYMBOL(iov_iter_discard); unsigned long iov_iter_alignment(const struct iov_iter *i) { - unsigned int p_mask = i->pipe->ring_size - 1; unsigned long res = 0; size_t size = i->count; if (unlikely(iov_iter_is_pipe(i))) { + unsigned int p_mask = i->pipe->ring_size - 1; + if (size && i->iov_offset && allocated(&i->pipe->bufs[i->head & p_mask])) return size | i->iov_offset; return size; From ee560e7bbab0c10cf3f0e71997fbc354ab2ee5cb Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 3 Dec 2019 12:36:52 +0300 Subject: [PATCH 014/118] scsi: mpt3sas: Fix double free in attach error handling The caller also calls _base_release_memory_pools() on error so it leads to a number of double frees: drivers/scsi/mpt3sas/mpt3sas_base.c:7207 mpt3sas_base_attach() warn: 'ioc->chain_dma_pool' double freed drivers/scsi/mpt3sas/mpt3sas_base.c:7207 mpt3sas_base_attach() warn: 'ioc->hpr_lookup' double freed drivers/scsi/mpt3sas/mpt3sas_base.c:7207 mpt3sas_base_attach() warn: 'ioc->internal_lookup' double freed drivers/scsi/mpt3sas/mpt3sas_base.c:7207 mpt3sas_base_attach() warn: 'ioc->pcie_sgl_dma_pool' double freed drivers/scsi/mpt3sas/mpt3sas_base.c:7207 mpt3sas_base_attach() warn: 'ioc->reply_dma_pool' double freed drivers/scsi/mpt3sas/mpt3sas_base.c:7207 mpt3sas_base_attach() warn: 'ioc->reply_free_dma_pool' double freed drivers/scsi/mpt3sas/mpt3sas_base.c:7207 mpt3sas_base_attach() warn: 'ioc->reply_post_free_array_dma_pool' double freed drivers/scsi/mpt3sas/mpt3sas_base.c:7207 mpt3sas_base_attach() warn: 'ioc->reply_post_free_dma_pool' double freed drivers/scsi/mpt3sas/mpt3sas_base.c:7207 mpt3sas_base_attach() warn: 'ioc->sense_dma_pool' double freed Fixes: 74522a92bbf0 ("scsi: mpt3sas: Optimize I/O memory consumption in driver.") Link: https://lore.kernel.org/r/20191203093652.gyntgvnkw2udatyc@kili.mountain Signed-off-by: Dan Carpenter Acked-by: Sreekanth Reddy Signed-off-by: Martin K. Petersen --- drivers/scsi/mpt3sas/mpt3sas_base.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.c b/drivers/scsi/mpt3sas/mpt3sas_base.c index 848fbec7bda6..45fd8dfb7c40 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_base.c +++ b/drivers/scsi/mpt3sas/mpt3sas_base.c @@ -5248,7 +5248,6 @@ _base_allocate_memory_pools(struct MPT3SAS_ADAPTER *ioc) &ct->chain_buffer_dma); if (!ct->chain_buffer) { ioc_err(ioc, "chain_lookup: pci_pool_alloc failed\n"); - _base_release_memory_pools(ioc); goto out; } } From 201743b99c0548ffa8ddbd2571b1dcf4d79b73a9 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 16 Dec 2019 14:16:49 +0100 Subject: [PATCH 015/118] scsi: lpfc: fix build failure with DEBUGFS disabled A recent change appears to have moved an #endif by accident: drivers/scsi/lpfc/lpfc_debugfs.c:5393:18: error: 'lpfc_debugfs_dumpHBASlim_open' undeclared here (not in a function); did you mean 'lpfc_debugfs_op_dumpHBASlim'? drivers/scsi/lpfc/lpfc_debugfs.c:5394:18: error: 'lpfc_debugfs_lseek' undeclared here (not in a function); did you mean 'lpfc_debugfs_nvme_trc'? drivers/scsi/lpfc/lpfc_debugfs.c:5395:18: error: 'lpfc_debugfs_read' undeclared here (not in a function); did you mean 'lpfc_debug_dump_q'? drivers/scsi/lpfc/lpfc_debugfs.c:5396:18: error: 'lpfc_debugfs_release' undeclared here (not in a function); did you mean 'lpfc_debugfs_terminate'? drivers/scsi/lpfc/lpfc_debugfs.c:5402:18: error: 'lpfc_debugfs_dumpHostSlim_open' undeclared here (not in a function); did you mean 'lpfc_debugfs_op_dumpHostSlim'? Move it back to where it was previously. Fixes: 95bfc6d8ad86 ("scsi: lpfc: Make FW logging dynamically configurable") Link: https://lore.kernel.org/r/20191216131701.3125077-1-arnd@arndb.de Signed-off-by: Arnd Bergmann Reviewed-by: James Smart Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_debugfs.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_debugfs.c b/drivers/scsi/lpfc/lpfc_debugfs.c index 2e6a68d9ea4f..a5ecbce4eda2 100644 --- a/drivers/scsi/lpfc/lpfc_debugfs.c +++ b/drivers/scsi/lpfc/lpfc_debugfs.c @@ -5385,7 +5385,6 @@ static const struct file_operations lpfc_debugfs_ras_log = { .read = lpfc_debugfs_read, .release = lpfc_debugfs_ras_log_release, }; -#endif #undef lpfc_debugfs_op_dumpHBASlim static const struct file_operations lpfc_debugfs_op_dumpHBASlim = { @@ -5557,7 +5556,7 @@ static const struct file_operations lpfc_idiag_op_extAcc = { .write = lpfc_idiag_extacc_write, .release = lpfc_idiag_cmd_release, }; - +#endif /* lpfc_idiag_mbxacc_dump_bsg_mbox - idiag debugfs dump bsg mailbox command * @phba: Pointer to HBA context object. From 04646aebd30b99f2cfa0182435a2ec252fcb16d0 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Fri, 6 Dec 2019 10:54:23 -0600 Subject: [PATCH 016/118] fs: avoid softlockups in s_inodes iterators Anything that walks all inodes on sb->s_inodes list without rescheduling risks softlockups. Previous efforts were made in 2 functions, see: c27d82f fs/drop_caches.c: avoid softlockups in drop_pagecache_sb() ac05fbb inode: don't softlockup when evicting inodes but there hasn't been an audit of all walkers, so do that now. This also consistently moves the cond_resched() calls to the bottom of each loop in cases where it already exists. One loop remains: remove_dquot_ref(), because I'm not quite sure how to deal with that one w/o taking the i_lock. Signed-off-by: Eric Sandeen Reviewed-by: Jan Kara Signed-off-by: Al Viro --- fs/drop_caches.c | 2 +- fs/inode.c | 7 +++++++ fs/notify/fsnotify.c | 1 + fs/quota/dquot.c | 1 + 4 files changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/drop_caches.c b/fs/drop_caches.c index d31b6c72b476..dc1a1d5d825b 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -35,11 +35,11 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) spin_unlock(&inode->i_lock); spin_unlock(&sb->s_inode_list_lock); - cond_resched(); invalidate_mapping_pages(inode->i_mapping, 0, -1); iput(toput_inode); toput_inode = inode; + cond_resched(); spin_lock(&sb->s_inode_list_lock); } spin_unlock(&sb->s_inode_list_lock); diff --git a/fs/inode.c b/fs/inode.c index fef457a42882..96d62d97694e 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -676,6 +676,7 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty) struct inode *inode, *next; LIST_HEAD(dispose); +again: spin_lock(&sb->s_inode_list_lock); list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { spin_lock(&inode->i_lock); @@ -698,6 +699,12 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty) inode_lru_list_del(inode); spin_unlock(&inode->i_lock); list_add(&inode->i_lru, &dispose); + if (need_resched()) { + spin_unlock(&sb->s_inode_list_lock); + cond_resched(); + dispose_list(&dispose); + goto again; + } } spin_unlock(&sb->s_inode_list_lock); diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 3e77b728a22b..6c902c0a1a5e 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -77,6 +77,7 @@ static void fsnotify_unmount_inodes(struct super_block *sb) iput_inode = inode; + cond_resched(); spin_lock(&sb->s_inode_list_lock); } spin_unlock(&sb->s_inode_list_lock); diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index b0688c02dc90..b6a4f692d345 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -984,6 +984,7 @@ static int add_dquot_ref(struct super_block *sb, int type) * later. */ old_inode = inode; + cond_resched(); spin_lock(&sb->s_inode_list_lock); } spin_unlock(&sb->s_inode_list_lock); From 1edc8eb2e93130e36ac74ac9c80913815a57d413 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Fri, 6 Dec 2019 10:55:59 -0600 Subject: [PATCH 017/118] fs: call fsnotify_sb_delete after evict_inodes When a filesystem is unmounted, we currently call fsnotify_sb_delete() before evict_inodes(), which means that fsnotify_unmount_inodes() must iterate over all inodes on the superblock looking for any inodes with watches. This is inefficient and can lead to livelocks as it iterates over many unwatched inodes. At this point, SB_ACTIVE is gone and dropping refcount to zero kicks the inode out out immediately, so anything processed by fsnotify_sb_delete / fsnotify_unmount_inodes gets evicted in that loop. After that, the call to evict_inodes will evict everything else with a zero refcount. This should speed things up overall, and avoid livelocks in fsnotify_unmount_inodes(). Signed-off-by: Eric Sandeen Reviewed-by: Jan Kara Signed-off-by: Al Viro --- fs/notify/fsnotify.c | 3 +++ fs/super.c | 4 +++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 6c902c0a1a5e..46f225580009 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -57,6 +57,9 @@ static void fsnotify_unmount_inodes(struct super_block *sb) * doing an __iget/iput with SB_ACTIVE clear would actually * evict all inodes with zero i_count from icache which is * unnecessarily violent and may in fact be illegal to do. + * However, we should have been called /after/ evict_inodes + * removed all zero refcount inodes, in any case. Test to + * be sure. */ if (!atomic_read(&inode->i_count)) { spin_unlock(&inode->i_lock); diff --git a/fs/super.c b/fs/super.c index cfadab2cbf35..cd352530eca9 100644 --- a/fs/super.c +++ b/fs/super.c @@ -448,10 +448,12 @@ void generic_shutdown_super(struct super_block *sb) sync_filesystem(sb); sb->s_flags &= ~SB_ACTIVE; - fsnotify_sb_delete(sb); cgroup_writeback_umount(); + /* evict all inodes with zero refcount */ evict_inodes(sb); + /* only nonzero refcount inodes can have marks */ + fsnotify_sb_delete(sb); if (sb->s_dio_done_wq) { destroy_workqueue(sb->s_dio_done_wq); From 291c2548458d3a8b326bfa8d7abbba86e036faf7 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 18 Dec 2019 08:43:01 +0000 Subject: [PATCH 018/118] scsi: lpfc: fix spelling mistakes of asynchronous There are spelling mistakes of asynchronous in a lpfc_printf_log message and comments. Fix these. Link: https://lore.kernel.org/r/20191218084301.627555-1-colin.king@canonical.com Signed-off-by: Colin Ian King Reviewed-by: James Smart Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_init.c | 2 +- drivers/scsi/lpfc/lpfc_sli.c | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c index 6298b1729098..6a04fdb3fbf2 100644 --- a/drivers/scsi/lpfc/lpfc_init.c +++ b/drivers/scsi/lpfc/lpfc_init.c @@ -5883,7 +5883,7 @@ void lpfc_sli4_async_event_proc(struct lpfc_hba *phba) break; default: lpfc_printf_log(phba, KERN_ERR, LOG_SLI, - "1804 Invalid asynchrous event code: " + "1804 Invalid asynchronous event code: " "x%x\n", bf_get(lpfc_trailer_code, &cq_event->cqe.mcqe_cmpl)); break; diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c index c82b5792da98..625c046ac4ef 100644 --- a/drivers/scsi/lpfc/lpfc_sli.c +++ b/drivers/scsi/lpfc/lpfc_sli.c @@ -8555,7 +8555,7 @@ lpfc_sli4_async_mbox_unblock(struct lpfc_hba *phba) psli->sli_flag &= ~LPFC_SLI_ASYNC_MBX_BLK; spin_unlock_irq(&phba->hbalock); - /* wake up worker thread to post asynchronlous mailbox command */ + /* wake up worker thread to post asynchronous mailbox command */ lpfc_worker_wake_up(phba); } @@ -8823,7 +8823,7 @@ lpfc_sli_issue_mbox_s4(struct lpfc_hba *phba, LPFC_MBOXQ_t *mboxq, return rc; } - /* Now, interrupt mode asynchrous mailbox command */ + /* Now, interrupt mode asynchronous mailbox command */ rc = lpfc_mbox_cmd_check(phba, mboxq); if (rc) { lpfc_printf_log(phba, KERN_ERR, LOG_MBOX | LOG_SLI, @@ -13112,11 +13112,11 @@ lpfc_cq_event_setup(struct lpfc_hba *phba, void *entry, int size) } /** - * lpfc_sli4_sp_handle_async_event - Handle an asynchroous event + * lpfc_sli4_sp_handle_async_event - Handle an asynchronous event * @phba: Pointer to HBA context object. * @cqe: Pointer to mailbox completion queue entry. * - * This routine process a mailbox completion queue entry with asynchrous + * This routine process a mailbox completion queue entry with asynchronous * event. * * Return: true if work posted to worker thread, otherwise false. @@ -13270,7 +13270,7 @@ lpfc_sli4_sp_handle_mbox_event(struct lpfc_hba *phba, struct lpfc_mcqe *mcqe) * @cqe: Pointer to mailbox completion queue entry. * * This routine process a mailbox completion queue entry, it invokes the - * proper mailbox complete handling or asynchrous event handling routine + * proper mailbox complete handling or asynchronous event handling routine * according to the MCQE's async bit. * * Return: true if work posted to worker thread, otherwise false. From 71482fde704efdd8c3abe0faf34d922c61e8d76b Mon Sep 17 00:00:00 2001 From: Varun Prakash Date: Wed, 18 Dec 2019 19:15:31 +0530 Subject: [PATCH 019/118] scsi: libcxgbi: fix NULL pointer dereference in cxgbi_device_destroy() If cxgb4i_ddp_init() fails then cdev->cdev2ppm will be NULL, so add a check for NULL pointer before dereferencing it. Link: https://lore.kernel.org/r/1576676731-3068-1-git-send-email-varun@chelsio.com Signed-off-by: Varun Prakash Signed-off-by: Martin K. Petersen --- drivers/scsi/cxgbi/libcxgbi.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/cxgbi/libcxgbi.c b/drivers/scsi/cxgbi/libcxgbi.c index 0d044c165960..dc9320f37ebf 100644 --- a/drivers/scsi/cxgbi/libcxgbi.c +++ b/drivers/scsi/cxgbi/libcxgbi.c @@ -121,7 +121,8 @@ static inline void cxgbi_device_destroy(struct cxgbi_device *cdev) "cdev 0x%p, p# %u.\n", cdev, cdev->nports); cxgbi_hbas_remove(cdev); cxgbi_device_portmap_cleanup(cdev); - cxgbi_ppm_release(cdev->cdev2ppm(cdev)); + if (cdev->cdev2ppm) + cxgbi_ppm_release(cdev->cdev2ppm(cdev)); if (cdev->pmap.max_connect) cxgbi_free_big_mem(cdev->pmap.port_csk); kfree(cdev); From e4dc9a4c31fe10d1751c542702afc85be8a5c56a Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Wed, 11 Dec 2019 17:36:02 +0200 Subject: [PATCH 020/118] scsi: target/iblock: Fix protection error with blocks greater than 512B The sector size of the block layer is 512 bytes, but integrity interval size might be different (in case of 4K block size of the media). At the initiator side the virtual start sector is the one that was originally submitted by the block layer (512 bytes) for the Reftag usage. The initiator converts the Reftag to integrity interval units and sends it to the target. So the target virtual start sector should be calculated at integrity interval units. prepare_fn() and complete_fn() don't remap correctly the Reftag when using incorrect units of the virtual start sector, which leads to the following protection error at the device: "blk_update_request: protection error, dev sdb, sector 2048 op 0x0:(READ) flags 0x10000 phys_seg 1 prio class 0" To fix that, set the seed in integrity interval units. Link: https://lore.kernel.org/r/1576078562-15240-1-git-send-email-israelr@mellanox.com Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Reviewed-by: Sagi Grimberg Signed-off-by: Martin K. Petersen --- drivers/target/target_core_iblock.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index 6949ea8bc387..51ffd5c002de 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -646,7 +646,9 @@ iblock_alloc_bip(struct se_cmd *cmd, struct bio *bio, } bip->bip_iter.bi_size = bio_integrity_bytes(bi, bio_sectors(bio)); - bip_set_seed(bip, bio->bi_iter.bi_sector); + /* virtual start sector must be in integrity interval units */ + bip_set_seed(bip, bio->bi_iter.bi_sector >> + (bi->interval_exp - SECTOR_SHIFT)); pr_debug("IBLOCK BIP Size: %u Sector: %llu\n", bip->bip_iter.bi_size, (unsigned long long)bip->bip_iter.bi_sector); From c9b3b8207bc487de02cbca968927ba2c2cb46aaf Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 10 Dec 2019 21:24:28 +0100 Subject: [PATCH 021/118] netfilter: nf_flow_table: fix big-endian integer overflow In some configurations, gcc reports an integer overflow: net/netfilter/nf_flow_table_offload.c: In function 'nf_flow_rule_match': net/netfilter/nf_flow_table_offload.c:80:21: error: unsigned conversion from 'int' to '__be16' {aka 'short unsigned int'} changes value from '327680' to '0' [-Werror=overflow] mask->tcp.flags = TCP_FLAG_RST | TCP_FLAG_FIN; ^~~~~~~~~~~~ From what I can tell, we want the upper 16 bits of these constants, so they need to be shifted in cpu-endian mode. Fixes: c29f74e0df7a ("netfilter: nf_flow_table: hardware offload support") Signed-off-by: Arnd Bergmann Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_offload.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index de7a0d1e15c8..0d72e5ccb47b 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -88,7 +88,7 @@ static int nf_flow_rule_match(struct nf_flow_match *match, switch (tuple->l4proto) { case IPPROTO_TCP: key->tcp.flags = 0; - mask->tcp.flags = TCP_FLAG_RST | TCP_FLAG_FIN; + mask->tcp.flags = cpu_to_be16(be32_to_cpu(TCP_FLAG_RST | TCP_FLAG_FIN) >> 16); match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_TCP); break; case IPPROTO_UDP: From d05d5db815d56a0ce203ed297153d9794dfdcb68 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 13 Dec 2019 01:19:58 +0100 Subject: [PATCH 022/118] selftests: netfilter: extend flowtable test script with dnat rule NAT test currently covers snat (masquerade) only. Also add a dnat rule and then check that a connecting to the to-be-dnated address will work. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- .../selftests/netfilter/nft_flowtable.sh | 39 ++++++++++++++++--- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/netfilter/nft_flowtable.sh b/tools/testing/selftests/netfilter/nft_flowtable.sh index 16571ac1dab4..d3e0809ab368 100755 --- a/tools/testing/selftests/netfilter/nft_flowtable.sh +++ b/tools/testing/selftests/netfilter/nft_flowtable.sh @@ -226,17 +226,19 @@ check_transfer() return 0 } -test_tcp_forwarding() +test_tcp_forwarding_ip() { local nsa=$1 local nsb=$2 + local dstip=$3 + local dstport=$4 local lret=0 ip netns exec $nsb nc -w 5 -l -p 12345 < "$ns2in" > "$ns2out" & lpid=$! sleep 1 - ip netns exec $nsa nc -w 4 10.0.2.99 12345 < "$ns1in" > "$ns1out" & + ip netns exec $nsa nc -w 4 "$dstip" "$dstport" < "$ns1in" > "$ns1out" & cpid=$! sleep 3 @@ -258,6 +260,28 @@ test_tcp_forwarding() return $lret } +test_tcp_forwarding() +{ + test_tcp_forwarding_ip "$1" "$2" 10.0.2.99 12345 + + return $? +} + +test_tcp_forwarding_nat() +{ + local lret + + test_tcp_forwarding_ip "$1" "$2" 10.0.2.99 12345 + lret=$? + + if [ $lret -eq 0 ] ; then + test_tcp_forwarding_ip "$1" "$2" 10.6.6.6 1666 + lret=$? + fi + + return $lret +} + make_file "$ns1in" "ns1" make_file "$ns2in" "ns2" @@ -283,14 +307,19 @@ ip -net ns2 route add 192.168.10.1 via 10.0.2.1 # Same, but with NAT enabled. ip netns exec nsr1 nft -f - < /dev/null ip netns exec ns2 sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null -test_tcp_forwarding ns1 ns2 +test_tcp_forwarding_nat ns1 ns2 if [ $? -eq 0 ] ;then echo "PASS: flow offloaded for ns1/ns2 with NAT and pmtu discovery" else From e608f631f0ba5f1fc5ee2e260a3a35d13107cbfe Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 15 Dec 2019 03:49:25 +0100 Subject: [PATCH 023/118] netfilter: ebtables: compat: reject all padding in matches/watchers syzbot reported following splat: BUG: KASAN: vmalloc-out-of-bounds in size_entry_mwt net/bridge/netfilter/ebtables.c:2063 [inline] BUG: KASAN: vmalloc-out-of-bounds in compat_copy_entries+0x128b/0x1380 net/bridge/netfilter/ebtables.c:2155 Read of size 4 at addr ffffc900004461f4 by task syz-executor267/7937 CPU: 1 PID: 7937 Comm: syz-executor267 Not tainted 5.5.0-rc1-syzkaller #0 size_entry_mwt net/bridge/netfilter/ebtables.c:2063 [inline] compat_copy_entries+0x128b/0x1380 net/bridge/netfilter/ebtables.c:2155 compat_do_replace+0x344/0x720 net/bridge/netfilter/ebtables.c:2249 compat_do_ebt_set_ctl+0x22f/0x27e net/bridge/netfilter/ebtables.c:2333 [..] Because padding isn't considered during computation of ->buf_user_offset, "total" is decremented by fewer bytes than it should. Therefore, the first part of if (*total < sizeof(*entry) || entry->next_offset < sizeof(*entry)) will pass, -- it should not have. This causes oob access: entry->next_offset is past the vmalloced size. Reject padding and check that computed user offset (sum of ebt_entry structure plus all individual matches/watchers/targets) is same value that userspace gave us as the offset of the next entry. Reported-by: syzbot+f68108fed972453a0ad4@syzkaller.appspotmail.com Fixes: 81e675c227ec ("netfilter: ebtables: add CONFIG_COMPAT support") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/ebtables.c | 35 ++++++++++++++++----------------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 4096d8a74a2b..e1256e03a9a8 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1867,7 +1867,7 @@ static int ebt_buf_count(struct ebt_entries_buf_state *state, unsigned int sz) } static int ebt_buf_add(struct ebt_entries_buf_state *state, - void *data, unsigned int sz) + const void *data, unsigned int sz) { if (state->buf_kern_start == NULL) goto count_only; @@ -1901,7 +1901,7 @@ enum compat_mwt { EBT_COMPAT_TARGET, }; -static int compat_mtw_from_user(struct compat_ebt_entry_mwt *mwt, +static int compat_mtw_from_user(const struct compat_ebt_entry_mwt *mwt, enum compat_mwt compat_mwt, struct ebt_entries_buf_state *state, const unsigned char *base) @@ -1979,22 +1979,23 @@ static int compat_mtw_from_user(struct compat_ebt_entry_mwt *mwt, /* return size of all matches, watchers or target, including necessary * alignment and padding. */ -static int ebt_size_mwt(struct compat_ebt_entry_mwt *match32, +static int ebt_size_mwt(const struct compat_ebt_entry_mwt *match32, unsigned int size_left, enum compat_mwt type, struct ebt_entries_buf_state *state, const void *base) { + const char *buf = (const char *)match32; int growth = 0; - char *buf; if (size_left == 0) return 0; - buf = (char *) match32; - - while (size_left >= sizeof(*match32)) { + do { struct ebt_entry_match *match_kern; int ret; + if (size_left < sizeof(*match32)) + return -EINVAL; + match_kern = (struct ebt_entry_match *) state->buf_kern_start; if (match_kern) { char *tmp; @@ -2031,22 +2032,18 @@ static int ebt_size_mwt(struct compat_ebt_entry_mwt *match32, if (match_kern) match_kern->match_size = ret; - /* rule should have no remaining data after target */ - if (type == EBT_COMPAT_TARGET && size_left) - return -EINVAL; - match32 = (struct compat_ebt_entry_mwt *) buf; - } + } while (size_left); return growth; } /* called for all ebt_entry structures. */ -static int size_entry_mwt(struct ebt_entry *entry, const unsigned char *base, +static int size_entry_mwt(const struct ebt_entry *entry, const unsigned char *base, unsigned int *total, struct ebt_entries_buf_state *state) { - unsigned int i, j, startoff, new_offset = 0; + unsigned int i, j, startoff, next_expected_off, new_offset = 0; /* stores match/watchers/targets & offset of next struct ebt_entry: */ unsigned int offsets[4]; unsigned int *offsets_update = NULL; @@ -2132,11 +2129,13 @@ static int size_entry_mwt(struct ebt_entry *entry, const unsigned char *base, return ret; } - startoff = state->buf_user_offset - startoff; - - if (WARN_ON(*total < startoff)) + next_expected_off = state->buf_user_offset - startoff; + if (next_expected_off != entry->next_offset) return -EINVAL; - *total -= startoff; + + if (*total < entry->next_offset) + return -EINVAL; + *total -= entry->next_offset; return 0; } From 8cb4ec44de42b99b92399b4d1daf3dc430ed0186 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Wed, 18 Dec 2019 00:59:29 +0100 Subject: [PATCH 024/118] netfilter: nft_tproxy: Fix port selector on Big Endian MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On Big Endian architectures, u16 port value was extracted from the wrong parts of u32 sreg_port, just like commit 10596608c4d62 ("netfilter: nf_tables: fix mismatch in big-endian system") describes. Fixes: 4ed8eb6570a49 ("netfilter: nf_tables: Add native tproxy support") Signed-off-by: Phil Sutter Acked-by: Florian Westphal Acked-by: Máté Eckl Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_tproxy.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c index f92a82c73880..95980154ef02 100644 --- a/net/netfilter/nft_tproxy.c +++ b/net/netfilter/nft_tproxy.c @@ -50,7 +50,7 @@ static void nft_tproxy_eval_v4(const struct nft_expr *expr, taddr = nf_tproxy_laddr4(skb, taddr, iph->daddr); if (priv->sreg_port) - tport = regs->data[priv->sreg_port]; + tport = nft_reg_load16(®s->data[priv->sreg_port]); if (!tport) tport = hp->dest; @@ -117,7 +117,7 @@ static void nft_tproxy_eval_v6(const struct nft_expr *expr, taddr = *nf_tproxy_laddr6(skb, &taddr, &iph->daddr); if (priv->sreg_port) - tport = regs->data[priv->sreg_port]; + tport = nft_reg_load16(®s->data[priv->sreg_port]); if (!tport) tport = hp->dest; From d55e5f5b70dd6214ef81fb2313121b72a7dd2200 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 11 Dec 2019 16:12:15 -0700 Subject: [PATCH 025/118] io_uring: use u64_to_user_ptr() consistently We use it in some spots, but not consistently. Convert the rest over, makes it easier to read as well. No functional changes in this patch. Signed-off-by: Jens Axboe --- fs/io_uring.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 6f084e3cf835..7a23d2351be2 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2157,7 +2157,7 @@ static int io_sendmsg_prep(struct io_kiocb *req, struct io_async_ctx *io) unsigned flags; flags = READ_ONCE(sqe->msg_flags); - msg = (struct user_msghdr __user *)(unsigned long) READ_ONCE(sqe->addr); + msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); io->msg.iov = io->msg.fast_iov; return sendmsg_copy_msghdr(&io->msg.msg, msg, flags, &io->msg.iov); #else @@ -2239,7 +2239,7 @@ static int io_recvmsg_prep(struct io_kiocb *req, struct io_async_ctx *io) unsigned flags; flags = READ_ONCE(sqe->msg_flags); - msg = (struct user_msghdr __user *)(unsigned long) READ_ONCE(sqe->addr); + msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); io->msg.iov = io->msg.fast_iov; return recvmsg_copy_msghdr(&io->msg.msg, msg, flags, &io->msg.uaddr, &io->msg.iov); @@ -2273,8 +2273,7 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt, else if (force_nonblock) flags |= MSG_DONTWAIT; - msg = (struct user_msghdr __user *) (unsigned long) - READ_ONCE(sqe->addr); + msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); if (req->io) { kmsg = &req->io->msg; kmsg->msg.msg_name = &addr; @@ -2331,9 +2330,8 @@ static int io_accept_prep(struct io_kiocb *req) if (sqe->ioprio || sqe->len || sqe->buf_index) return -EINVAL; - accept->addr = (struct sockaddr __user *) - (unsigned long) READ_ONCE(sqe->addr); - accept->addr_len = (int __user *) (unsigned long) READ_ONCE(sqe->addr2); + accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); + accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2)); accept->flags = READ_ONCE(sqe->accept_flags); req->flags |= REQ_F_PREPPED; return 0; @@ -2407,7 +2405,7 @@ static int io_connect_prep(struct io_kiocb *req, struct io_async_ctx *io) struct sockaddr __user *addr; int addr_len; - addr = (struct sockaddr __user *) (unsigned long) READ_ONCE(sqe->addr); + addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); addr_len = READ_ONCE(sqe->addr2); return move_addr_to_kernel(addr, addr_len, &io->connect.address); #else @@ -4702,7 +4700,7 @@ static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst, if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov))) return -EFAULT; - dst->iov_base = (void __user *) (unsigned long) ciov.iov_base; + dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base); dst->iov_len = ciov.iov_len; return 0; } From f33121cbe91973a08e68e4bde8c3f7e6e4e351c1 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 18 Dec 2019 16:38:49 +0000 Subject: [PATCH 026/118] rxrpc: Unlock new call in rxrpc_new_incoming_call() rather than the caller Move the unlock and the ping transmission for a new incoming call into rxrpc_new_incoming_call() rather than doing it in the caller. This makes it clearer to see what's going on. Suggested-by: Peter Zijlstra Signed-off-by: David Howells Acked-by: Peter Zijlstra (Intel) cc: Ingo Molnar cc: Will Deacon cc: Davidlohr Bueso --- net/rxrpc/call_accept.c | 36 ++++++++++++++++++++++++++++-------- net/rxrpc/input.c | 18 ------------------ 2 files changed, 28 insertions(+), 26 deletions(-) diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 135bf5cd8dd5..3685b1732f65 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -239,6 +239,22 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx) kfree(b); } +/* + * Ping the other end to fill our RTT cache and to retrieve the rwind + * and MTU parameters. + */ +static void rxrpc_send_ping(struct rxrpc_call *call, struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + ktime_t now = skb->tstamp; + + if (call->peer->rtt_usage < 3 || + ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), now)) + rxrpc_propose_ACK(call, RXRPC_ACK_PING, sp->hdr.serial, + true, true, + rxrpc_propose_ack_ping_for_params); +} + /* * Allocate a new incoming call from the prealloc pool, along with a connection * and a peer as necessary. @@ -346,9 +362,7 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, sp->hdr.seq, RX_INVALID_OPERATION, ESHUTDOWN); skb->mark = RXRPC_SKB_MARK_REJECT_ABORT; skb->priority = RX_INVALID_OPERATION; - _leave(" = NULL [close]"); - call = NULL; - goto out; + goto no_call; } /* The peer, connection and call may all have sprung into existence due @@ -361,9 +375,7 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, call = rxrpc_alloc_incoming_call(rx, local, peer, conn, skb); if (!call) { skb->mark = RXRPC_SKB_MARK_REJECT_BUSY; - _leave(" = NULL [busy]"); - call = NULL; - goto out; + goto no_call; } trace_rxrpc_receive(call, rxrpc_receive_incoming, @@ -432,10 +444,18 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, */ rxrpc_put_call(call, rxrpc_call_put); - _leave(" = %p{%d}", call, call->debug_id); -out: spin_unlock(&rx->incoming_lock); + + rxrpc_send_ping(call, skb); + mutex_unlock(&call->user_mutex); + + _leave(" = %p{%d}", call, call->debug_id); return call; + +no_call: + spin_unlock(&rx->incoming_lock); + _leave(" = NULL [%u]", skb->mark); + return NULL; } /* diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 157be1ff8697..86bd133b4fa0 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -192,22 +192,6 @@ static void rxrpc_congestion_management(struct rxrpc_call *call, goto out_no_clear_ca; } -/* - * Ping the other end to fill our RTT cache and to retrieve the rwind - * and MTU parameters. - */ -static void rxrpc_send_ping(struct rxrpc_call *call, struct sk_buff *skb) -{ - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - ktime_t now = skb->tstamp; - - if (call->peer->rtt_usage < 3 || - ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), now)) - rxrpc_propose_ACK(call, RXRPC_ACK_PING, sp->hdr.serial, - true, true, - rxrpc_propose_ack_ping_for_params); -} - /* * Apply a hard ACK by advancing the Tx window. */ @@ -1396,8 +1380,6 @@ int rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb) call = rxrpc_new_incoming_call(local, rx, skb); if (!call) goto reject_packet; - rxrpc_send_ping(call, skb); - mutex_unlock(&call->user_mutex); } /* Process a call packet; this either discards or passes on the ref From 13b7955a0252e15265386b229b814152f109b234 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 20 Dec 2019 16:20:56 +0000 Subject: [PATCH 027/118] rxrpc: Don't take call->user_mutex in rxrpc_new_incoming_call() Standard kernel mutexes cannot be used in any way from interrupt or softirq context, so the user_mutex which manages access to a call cannot be a mutex since on a new call the mutex must start off locked and be unlocked within the softirq handler to prevent userspace interfering with a call we're setting up. Commit a0855d24fc22d49cdc25664fb224caee16998683 ("locking/mutex: Complain upon mutex API misuse in IRQ contexts") causes big warnings to be splashed in dmesg for each a new call that comes in from the server. Whilst it *seems* like it should be okay, since the accept path uses trylock, there are issues with PI boosting and marking the wrong task as the owner. Fix this by not taking the mutex in the softirq path at all. It's not obvious that there should be any need for it as the state is set before the first notification is generated for the new call. There's also no particular reason why the link-assessing ping should be triggered inside the mutex. It's not actually transmitted there anyway, but rather it has to be deferred to a workqueue. Further, I don't think that there's any particular reason that the socket notification needs to be done from within rx->incoming_lock, so the amount of time that lock is held can be shortened too and the ping prepared before the new call notification is sent. Fixes: 540b1c48c37a ("rxrpc: Fix deadlock between call creation and sendmsg/recvmsg") Signed-off-by: David Howells cc: Peter Zijlstra (Intel) cc: Ingo Molnar cc: Will Deacon cc: Davidlohr Bueso --- net/rxrpc/call_accept.c | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 3685b1732f65..44fa22b020ef 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -381,18 +381,6 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, trace_rxrpc_receive(call, rxrpc_receive_incoming, sp->hdr.serial, sp->hdr.seq); - /* Lock the call to prevent rxrpc_kernel_send/recv_data() and - * sendmsg()/recvmsg() inconveniently stealing the mutex once the - * notification is generated. - * - * The BUG should never happen because the kernel should be well - * behaved enough not to access the call before the first notification - * event and userspace is prevented from doing so until the state is - * appropriate. - */ - if (!mutex_trylock(&call->user_mutex)) - BUG(); - /* Make the call live. */ rxrpc_incoming_call(rx, call, skb); conn = call->conn; @@ -433,6 +421,9 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, BUG(); } spin_unlock(&conn->state_lock); + spin_unlock(&rx->incoming_lock); + + rxrpc_send_ping(call, skb); if (call->state == RXRPC_CALL_SERVER_ACCEPTING) rxrpc_notify_socket(call); @@ -444,11 +435,6 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, */ rxrpc_put_call(call, rxrpc_call_put); - spin_unlock(&rx->incoming_lock); - - rxrpc_send_ping(call, skb); - mutex_unlock(&call->user_mutex); - _leave(" = %p{%d}", call, call->debug_id); return call; From 063c60d39180cec7c9317f5acfc3071f8fecd705 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 20 Dec 2019 16:17:16 +0000 Subject: [PATCH 028/118] rxrpc: Fix missing security check on incoming calls Fix rxrpc_new_incoming_call() to check that we have a suitable service key available for the combination of service ID and security class of a new incoming call - and to reject calls for which we don't. This causes an assertion like the following to appear: rxrpc: Assertion failed - 6(0x6) == 12(0xc) is false kernel BUG at net/rxrpc/call_object.c:456! Where call->state is RXRPC_CALL_SERVER_SECURING (6) rather than RXRPC_CALL_COMPLETE (12). Fixes: 248f219cb8bc ("rxrpc: Rewrite the data and ack handling code") Reported-by: Marc Dionne Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 10 ++++-- net/rxrpc/call_accept.c | 14 ++++++-- net/rxrpc/conn_event.c | 16 +-------- net/rxrpc/conn_service.c | 4 +++ net/rxrpc/rxkad.c | 5 +-- net/rxrpc/security.c | 70 +++++++++++++++++++--------------------- 6 files changed, 59 insertions(+), 60 deletions(-) diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 7c7d10f2e0c1..5e99df80e80a 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -209,6 +209,7 @@ struct rxrpc_skb_priv { struct rxrpc_security { const char *name; /* name of this service */ u8 security_index; /* security type provided */ + u32 no_key_abort; /* Abort code indicating no key */ /* Initialise a security service */ int (*init)(void); @@ -977,8 +978,9 @@ static inline void rxrpc_reduce_conn_timer(struct rxrpc_connection *conn, struct rxrpc_connection *rxrpc_find_service_conn_rcu(struct rxrpc_peer *, struct sk_buff *); struct rxrpc_connection *rxrpc_prealloc_service_connection(struct rxrpc_net *, gfp_t); -void rxrpc_new_incoming_connection(struct rxrpc_sock *, - struct rxrpc_connection *, struct sk_buff *); +void rxrpc_new_incoming_connection(struct rxrpc_sock *, struct rxrpc_connection *, + const struct rxrpc_security *, struct key *, + struct sk_buff *); void rxrpc_unpublish_service_conn(struct rxrpc_connection *); /* @@ -1103,7 +1105,9 @@ extern const struct rxrpc_security rxkad; int __init rxrpc_init_security(void); void rxrpc_exit_security(void); int rxrpc_init_client_conn_security(struct rxrpc_connection *); -int rxrpc_init_server_conn_security(struct rxrpc_connection *); +bool rxrpc_look_up_server_security(struct rxrpc_local *, struct rxrpc_sock *, + const struct rxrpc_security **, struct key **, + struct sk_buff *); /* * sendmsg.c diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 44fa22b020ef..70e44abf106c 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -263,6 +263,8 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, struct rxrpc_local *local, struct rxrpc_peer *peer, struct rxrpc_connection *conn, + const struct rxrpc_security *sec, + struct key *key, struct sk_buff *skb) { struct rxrpc_backlog *b = rx->backlog; @@ -310,7 +312,7 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, conn->params.local = rxrpc_get_local(local); conn->params.peer = peer; rxrpc_see_connection(conn); - rxrpc_new_incoming_connection(rx, conn, skb); + rxrpc_new_incoming_connection(rx, conn, sec, key, skb); } else { rxrpc_get_connection(conn); } @@ -349,9 +351,11 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + const struct rxrpc_security *sec = NULL; struct rxrpc_connection *conn; struct rxrpc_peer *peer = NULL; - struct rxrpc_call *call; + struct rxrpc_call *call = NULL; + struct key *key = NULL; _enter(""); @@ -372,7 +376,11 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, */ conn = rxrpc_find_connection_rcu(local, skb, &peer); - call = rxrpc_alloc_incoming_call(rx, local, peer, conn, skb); + if (!conn && !rxrpc_look_up_server_security(local, rx, &sec, &key, skb)) + goto no_call; + + call = rxrpc_alloc_incoming_call(rx, local, peer, conn, sec, key, skb); + key_put(key); if (!call) { skb->mark = RXRPC_SKB_MARK_REJECT_BUSY; goto no_call; diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index a1ceef4f5cd0..808a4723f868 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -376,21 +376,7 @@ static void rxrpc_secure_connection(struct rxrpc_connection *conn) _enter("{%d}", conn->debug_id); ASSERT(conn->security_ix != 0); - - if (!conn->params.key) { - _debug("set up security"); - ret = rxrpc_init_server_conn_security(conn); - switch (ret) { - case 0: - break; - case -ENOENT: - abort_code = RX_CALL_DEAD; - goto abort; - default: - abort_code = RXKADNOAUTH; - goto abort; - } - } + ASSERT(conn->server_key); if (conn->security->issue_challenge(conn) < 0) { abort_code = RX_CALL_DEAD; diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c index 123d6ceab15c..21da48e3d2e5 100644 --- a/net/rxrpc/conn_service.c +++ b/net/rxrpc/conn_service.c @@ -148,6 +148,8 @@ struct rxrpc_connection *rxrpc_prealloc_service_connection(struct rxrpc_net *rxn */ void rxrpc_new_incoming_connection(struct rxrpc_sock *rx, struct rxrpc_connection *conn, + const struct rxrpc_security *sec, + struct key *key, struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); @@ -160,6 +162,8 @@ void rxrpc_new_incoming_connection(struct rxrpc_sock *rx, conn->service_id = sp->hdr.serviceId; conn->security_ix = sp->hdr.securityIndex; conn->out_clientflag = 0; + conn->security = sec; + conn->server_key = key_get(key); if (conn->security_ix) conn->state = RXRPC_CONN_SERVICE_UNSECURED; else diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 8d8aa3c230b5..098f1f9ec53b 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -648,9 +648,9 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn) u32 serial; int ret; - _enter("{%d,%x}", conn->debug_id, key_serial(conn->params.key)); + _enter("{%d,%x}", conn->debug_id, key_serial(conn->server_key)); - ret = key_validate(conn->params.key); + ret = key_validate(conn->server_key); if (ret < 0) return ret; @@ -1293,6 +1293,7 @@ static void rxkad_exit(void) const struct rxrpc_security rxkad = { .name = "rxkad", .security_index = RXRPC_SECURITY_RXKAD, + .no_key_abort = RXKADUNKNOWNKEY, .init = rxkad_init, .exit = rxkad_exit, .init_connection_security = rxkad_init_connection_security, diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c index a4c47d2b7054..9b1fb9ed0717 100644 --- a/net/rxrpc/security.c +++ b/net/rxrpc/security.c @@ -101,62 +101,58 @@ int rxrpc_init_client_conn_security(struct rxrpc_connection *conn) } /* - * initialise the security on a server connection + * Find the security key for a server connection. */ -int rxrpc_init_server_conn_security(struct rxrpc_connection *conn) +bool rxrpc_look_up_server_security(struct rxrpc_local *local, struct rxrpc_sock *rx, + const struct rxrpc_security **_sec, + struct key **_key, + struct sk_buff *skb) { const struct rxrpc_security *sec; - struct rxrpc_local *local = conn->params.local; - struct rxrpc_sock *rx; - struct key *key; - key_ref_t kref; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + key_ref_t kref = NULL; char kdesc[5 + 1 + 3 + 1]; _enter(""); - sprintf(kdesc, "%u:%u", conn->service_id, conn->security_ix); + sprintf(kdesc, "%u:%u", sp->hdr.serviceId, sp->hdr.securityIndex); - sec = rxrpc_security_lookup(conn->security_ix); + sec = rxrpc_security_lookup(sp->hdr.securityIndex); if (!sec) { - _leave(" = -ENOKEY [lookup]"); - return -ENOKEY; + trace_rxrpc_abort(0, "SVS", + sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, + RX_INVALID_OPERATION, EKEYREJECTED); + skb->mark = RXRPC_SKB_MARK_REJECT_ABORT; + skb->priority = RX_INVALID_OPERATION; + return false; } - /* find the service */ - read_lock(&local->services_lock); - rx = rcu_dereference_protected(local->service, - lockdep_is_held(&local->services_lock)); - if (rx && (rx->srx.srx_service == conn->service_id || - rx->second_service == conn->service_id)) - goto found_service; + if (sp->hdr.securityIndex == RXRPC_SECURITY_NONE) + goto out; - /* the service appears to have died */ - read_unlock(&local->services_lock); - _leave(" = -ENOENT"); - return -ENOENT; - -found_service: if (!rx->securities) { - read_unlock(&local->services_lock); - _leave(" = -ENOKEY"); - return -ENOKEY; + trace_rxrpc_abort(0, "SVR", + sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, + RX_INVALID_OPERATION, EKEYREJECTED); + skb->mark = RXRPC_SKB_MARK_REJECT_ABORT; + skb->priority = RX_INVALID_OPERATION; + return false; } /* look through the service's keyring */ kref = keyring_search(make_key_ref(rx->securities, 1UL), &key_type_rxrpc_s, kdesc, true); if (IS_ERR(kref)) { - read_unlock(&local->services_lock); - _leave(" = %ld [search]", PTR_ERR(kref)); - return PTR_ERR(kref); + trace_rxrpc_abort(0, "SVK", + sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, + sec->no_key_abort, EKEYREJECTED); + skb->mark = RXRPC_SKB_MARK_REJECT_ABORT; + skb->priority = sec->no_key_abort; + return false; } - key = key_ref_to_ptr(kref); - read_unlock(&local->services_lock); - - conn->server_key = key; - conn->security = sec; - - _leave(" = 0"); - return 0; +out: + *_sec = sec; + *_key = key_ref_to_ptr(kref); + return true; } From 9adbd45d6d32ffc1a03f3c51d72cfc69ebfc2ddb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2019 08:45:55 -0700 Subject: [PATCH 029/118] io_uring: add and use struct io_rw for read/writes Put the kiocb in struct io_rw, and add the addr/len for the request as well. Use the kiocb->private field for the buffer index for fixed reads and writes. Any use of kiocb->ki_filp is flipped to req->file. It's the same thing, and less confusing. Signed-off-by: Jens Axboe --- fs/io_uring.c | 96 +++++++++++++++++++++++++++------------------------ 1 file changed, 50 insertions(+), 46 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 7a23d2351be2..b5f91d21fd04 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -332,6 +332,13 @@ struct io_timeout { int flags; }; +struct io_rw { + /* NOTE: kiocb has the file as the first member, so don't do it here */ + struct kiocb kiocb; + u64 addr; + u64 len; +}; + struct io_async_connect { struct sockaddr_storage address; }; @@ -369,7 +376,7 @@ struct io_async_ctx { struct io_kiocb { union { struct file *file; - struct kiocb rw; + struct io_rw rw; struct io_poll_iocb poll; struct io_accept accept; struct io_sync sync; @@ -1180,7 +1187,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, ret = 0; list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) { - struct kiocb *kiocb = &req->rw; + struct kiocb *kiocb = &req->rw.kiocb; /* * Move completed entries to our local list. If we find a @@ -1335,7 +1342,7 @@ static inline void req_set_fail_links(struct io_kiocb *req) static void io_complete_rw_common(struct kiocb *kiocb, long res) { - struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); + struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); if (kiocb->ki_flags & IOCB_WRITE) kiocb_end_write(req); @@ -1347,7 +1354,7 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res) static void io_complete_rw(struct kiocb *kiocb, long res, long res2) { - struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); + struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); io_complete_rw_common(kiocb, res); io_put_req(req); @@ -1355,7 +1362,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res, long res2) static struct io_kiocb *__io_complete_rw(struct kiocb *kiocb, long res) { - struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); + struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); struct io_kiocb *nxt = NULL; io_complete_rw_common(kiocb, res); @@ -1366,7 +1373,7 @@ static struct io_kiocb *__io_complete_rw(struct kiocb *kiocb, long res) static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) { - struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); + struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); if (kiocb->ki_flags & IOCB_WRITE) kiocb_end_write(req); @@ -1400,7 +1407,7 @@ static void io_iopoll_req_issued(struct io_kiocb *req) list_req = list_first_entry(&ctx->poll_list, struct io_kiocb, list); - if (list_req->rw.ki_filp != req->rw.ki_filp) + if (list_req->file != req->file) ctx->poll_multi_file = true; } @@ -1475,7 +1482,7 @@ static int io_prep_rw(struct io_kiocb *req, bool force_nonblock) { const struct io_uring_sqe *sqe = req->sqe; struct io_ring_ctx *ctx = req->ctx; - struct kiocb *kiocb = &req->rw; + struct kiocb *kiocb = &req->rw.kiocb; unsigned ioprio; int ret; @@ -1524,6 +1531,12 @@ static int io_prep_rw(struct io_kiocb *req, bool force_nonblock) return -EINVAL; kiocb->ki_complete = io_complete_rw; } + + req->rw.addr = READ_ONCE(req->sqe->addr); + req->rw.len = READ_ONCE(req->sqe->len); + /* we own ->private, reuse it for the buffer index */ + req->rw.kiocb.private = (void *) (unsigned long) + READ_ONCE(req->sqe->buf_index); return 0; } @@ -1557,11 +1570,11 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_kiocb **nxt, io_rw_done(kiocb, ret); } -static ssize_t io_import_fixed(struct io_ring_ctx *ctx, int rw, - const struct io_uring_sqe *sqe, +static ssize_t io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter) { - size_t len = READ_ONCE(sqe->len); + struct io_ring_ctx *ctx = req->ctx; + size_t len = req->rw.len; struct io_mapped_ubuf *imu; unsigned index, buf_index; size_t offset; @@ -1571,13 +1584,13 @@ static ssize_t io_import_fixed(struct io_ring_ctx *ctx, int rw, if (unlikely(!ctx->user_bufs)) return -EFAULT; - buf_index = READ_ONCE(sqe->buf_index); + buf_index = (unsigned long) req->rw.kiocb.private; if (unlikely(buf_index >= ctx->nr_user_bufs)) return -EFAULT; index = array_index_nospec(buf_index, ctx->nr_user_bufs); imu = &ctx->user_bufs[index]; - buf_addr = READ_ONCE(sqe->addr); + buf_addr = req->rw.addr; /* overflow */ if (buf_addr + len < buf_addr) @@ -1634,25 +1647,20 @@ static ssize_t io_import_fixed(struct io_ring_ctx *ctx, int rw, static ssize_t io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec, struct iov_iter *iter) { - const struct io_uring_sqe *sqe = req->sqe; - void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr)); - size_t sqe_len = READ_ONCE(sqe->len); + void __user *buf = u64_to_user_ptr(req->rw.addr); + size_t sqe_len = req->rw.len; u8 opcode; - /* - * We're reading ->opcode for the second time, but the first read - * doesn't care whether it's _FIXED or not, so it doesn't matter - * whether ->opcode changes concurrently. The first read does care - * about whether it is a READ or a WRITE, so we don't trust this read - * for that purpose and instead let the caller pass in the read/write - * flag. - */ opcode = req->opcode; if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { *iovec = NULL; - return io_import_fixed(req->ctx, rw, sqe, iter); + return io_import_fixed(req, rw, iter); } + /* buffer index only valid with fixed read/write */ + if (req->rw.kiocb.private) + return -EINVAL; + if (req->io) { struct io_async_rw *iorw = &req->io->rw; @@ -1801,9 +1809,8 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, bool force_nonblock) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; - struct kiocb *kiocb = &req->rw; + struct kiocb *kiocb = &req->rw.kiocb; struct iov_iter iter; - struct file *file; size_t iov_count; ssize_t io_size, ret; @@ -1819,9 +1826,8 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, /* Ensure we clear previously set non-block flag */ if (!force_nonblock) - req->rw.ki_flags &= ~IOCB_NOWAIT; + req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT; - file = req->file; io_size = ret; if (req->flags & REQ_F_LINK) req->result = io_size; @@ -1830,20 +1836,20 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so * we know to async punt it even if it was opened O_NONBLOCK */ - if (force_nonblock && !io_file_supports_async(file)) { + if (force_nonblock && !io_file_supports_async(req->file)) { req->flags |= REQ_F_MUST_PUNT; goto copy_iov; } iov_count = iov_iter_count(&iter); - ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_count); + ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count); if (!ret) { ssize_t ret2; - if (file->f_op->read_iter) - ret2 = call_read_iter(file, kiocb, &iter); + if (req->file->f_op->read_iter) + ret2 = call_read_iter(req->file, kiocb, &iter); else - ret2 = loop_rw_iter(READ, file, kiocb, &iter); + ret2 = loop_rw_iter(READ, req->file, kiocb, &iter); /* * In case of a short read, punt to async. This can happen @@ -1894,9 +1900,8 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, bool force_nonblock) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; - struct kiocb *kiocb = &req->rw; + struct kiocb *kiocb = &req->rw.kiocb; struct iov_iter iter; - struct file *file; size_t iov_count; ssize_t ret, io_size; @@ -1912,9 +1917,8 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, /* Ensure we clear previously set non-block flag */ if (!force_nonblock) - req->rw.ki_flags &= ~IOCB_NOWAIT; + req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT; - file = kiocb->ki_filp; io_size = ret; if (req->flags & REQ_F_LINK) req->result = io_size; @@ -1934,7 +1938,7 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, goto copy_iov; iov_count = iov_iter_count(&iter); - ret = rw_verify_area(WRITE, file, &kiocb->ki_pos, iov_count); + ret = rw_verify_area(WRITE, req->file, &kiocb->ki_pos, iov_count); if (!ret) { ssize_t ret2; @@ -1946,17 +1950,17 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, * we return to userspace. */ if (req->flags & REQ_F_ISREG) { - __sb_start_write(file_inode(file)->i_sb, + __sb_start_write(file_inode(req->file)->i_sb, SB_FREEZE_WRITE, true); - __sb_writers_release(file_inode(file)->i_sb, + __sb_writers_release(file_inode(req->file)->i_sb, SB_FREEZE_WRITE); } kiocb->ki_flags |= IOCB_WRITE; - if (file->f_op->write_iter) - ret2 = call_write_iter(file, kiocb, &iter); + if (req->file->f_op->write_iter) + ret2 = call_write_iter(req->file, kiocb, &iter); else - ret2 = loop_rw_iter(WRITE, file, kiocb, &iter); + ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter); if (!force_nonblock || ret2 != -EAGAIN) { kiocb_done(kiocb, ret2, nxt, req->in_async); } else { @@ -2036,7 +2040,7 @@ static void io_fsync_finish(struct io_wq_work **workptr) if (io_req_cancelled(req)) return; - ret = vfs_fsync_range(req->rw.ki_filp, req->sync.off, + ret = vfs_fsync_range(req->file, req->sync.off, end > 0 ? end : LLONG_MAX, req->sync.flags & IORING_FSYNC_DATASYNC); if (ret < 0) @@ -2102,7 +2106,7 @@ static void io_sync_file_range_finish(struct io_wq_work **workptr) if (io_req_cancelled(req)) return; - ret = sync_file_range(req->rw.ki_filp, req->sync.off, req->sync.len, + ret = sync_file_range(req->file, req->sync.off, req->sync.len, req->sync.flags); if (ret < 0) req_set_fail_links(req); From 3fbb51c18f5c15a23db74c4da79d3d035176c480 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2019 08:51:52 -0700 Subject: [PATCH 030/118] io_uring: move all prep state for IORING_OP_CONNECT to prep handler Add struct io_connect in our io_kiocb per-command union, and ensure that io_connect_prep() has grabbed what it needs from the SQE. Signed-off-by: Jens Axboe --- fs/io_uring.c | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index b5f91d21fd04..2a173f54ec8e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -339,6 +339,12 @@ struct io_rw { u64 len; }; +struct io_connect { + struct file *file; + struct sockaddr __user *addr; + int addr_len; +}; + struct io_async_connect { struct sockaddr_storage address; }; @@ -382,6 +388,7 @@ struct io_kiocb { struct io_sync sync; struct io_cancel cancel; struct io_timeout timeout; + struct io_connect connect; }; const struct io_uring_sqe *sqe; @@ -2406,14 +2413,18 @@ static int io_connect_prep(struct io_kiocb *req, struct io_async_ctx *io) { #if defined(CONFIG_NET) const struct io_uring_sqe *sqe = req->sqe; - struct sockaddr __user *addr; - int addr_len; - addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); - addr_len = READ_ONCE(sqe->addr2); - return move_addr_to_kernel(addr, addr_len, &io->connect.address); + if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) + return -EINVAL; + if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags) + return -EINVAL; + + req->connect.addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); + req->connect.addr_len = READ_ONCE(sqe->addr2); + return move_addr_to_kernel(req->connect.addr, req->connect.addr_len, + &io->connect.address); #else - return 0; + return -EOPNOTSUPP; #endif } @@ -2421,18 +2432,9 @@ static int io_connect(struct io_kiocb *req, struct io_kiocb **nxt, bool force_nonblock) { #if defined(CONFIG_NET) - const struct io_uring_sqe *sqe = req->sqe; struct io_async_ctx __io, *io; unsigned file_flags; - int addr_len, ret; - - if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) - return -EINVAL; - if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags) - return -EINVAL; - - addr_len = READ_ONCE(sqe->addr2); - file_flags = force_nonblock ? O_NONBLOCK : 0; + int ret; if (req->io) { io = req->io; @@ -2443,8 +2445,10 @@ static int io_connect(struct io_kiocb *req, struct io_kiocb **nxt, io = &__io; } - ret = __sys_connect_file(req->file, &io->connect.address, addr_len, - file_flags); + file_flags = force_nonblock ? O_NONBLOCK : 0; + + ret = __sys_connect_file(req->file, &io->connect.address, + req->connect.addr_len, file_flags); if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) { if (req->io) return -EAGAIN; From e47293fdf98998292a89d516c8f7b8b9eb5c5213 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2019 08:58:21 -0700 Subject: [PATCH 031/118] io_uring: move all prep state for IORING_OP_{SEND,RECV}_MGS to prep handler Add struct io_sr_msg in our io_kiocb per-command union, and ensure that the send/recvmsg prep handlers have grabbed what they need from the SQE by the time prep is done. Signed-off-by: Jens Axboe --- fs/io_uring.c | 64 ++++++++++++++++++++++++++------------------------- 1 file changed, 33 insertions(+), 31 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 2a173f54ec8e..89e5b19044cc 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -345,6 +345,12 @@ struct io_connect { int addr_len; }; +struct io_sr_msg { + struct file *file; + struct user_msghdr __user *msg; + int msg_flags; +}; + struct io_async_connect { struct sockaddr_storage address; }; @@ -389,6 +395,7 @@ struct io_kiocb { struct io_cancel cancel; struct io_timeout timeout; struct io_connect connect; + struct io_sr_msg sr_msg; }; const struct io_uring_sqe *sqe; @@ -2164,15 +2171,15 @@ static int io_sendmsg_prep(struct io_kiocb *req, struct io_async_ctx *io) { #if defined(CONFIG_NET) const struct io_uring_sqe *sqe = req->sqe; - struct user_msghdr __user *msg; - unsigned flags; + struct io_sr_msg *sr = &req->sr_msg; - flags = READ_ONCE(sqe->msg_flags); - msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); + sr->msg_flags = READ_ONCE(sqe->msg_flags); + sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); io->msg.iov = io->msg.fast_iov; - return sendmsg_copy_msghdr(&io->msg.msg, msg, flags, &io->msg.iov); + return sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags, + &io->msg.iov); #else - return 0; + return -EOPNOTSUPP; #endif } @@ -2180,7 +2187,6 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt, bool force_nonblock) { #if defined(CONFIG_NET) - const struct io_uring_sqe *sqe = req->sqe; struct io_async_msghdr *kmsg = NULL; struct socket *sock; int ret; @@ -2194,12 +2200,6 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt, struct sockaddr_storage addr; unsigned flags; - flags = READ_ONCE(sqe->msg_flags); - if (flags & MSG_DONTWAIT) - req->flags |= REQ_F_NOWAIT; - else if (force_nonblock) - flags |= MSG_DONTWAIT; - if (req->io) { kmsg = &req->io->msg; kmsg->msg.msg_name = &addr; @@ -2215,6 +2215,12 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt, goto out; } + flags = req->sr_msg.msg_flags; + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if (force_nonblock) + flags |= MSG_DONTWAIT; + ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); if (force_nonblock && ret == -EAGAIN) { if (req->io) @@ -2245,17 +2251,15 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt, static int io_recvmsg_prep(struct io_kiocb *req, struct io_async_ctx *io) { #if defined(CONFIG_NET) - const struct io_uring_sqe *sqe = req->sqe; - struct user_msghdr __user *msg; - unsigned flags; + struct io_sr_msg *sr = &req->sr_msg; - flags = READ_ONCE(sqe->msg_flags); - msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); + sr->msg_flags = READ_ONCE(req->sqe->msg_flags); + sr->msg = u64_to_user_ptr(READ_ONCE(req->sqe->addr)); io->msg.iov = io->msg.fast_iov; - return recvmsg_copy_msghdr(&io->msg.msg, msg, flags, &io->msg.uaddr, - &io->msg.iov); + return recvmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags, + &io->msg.uaddr, &io->msg.iov); #else - return 0; + return -EOPNOTSUPP; #endif } @@ -2263,7 +2267,6 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt, bool force_nonblock) { #if defined(CONFIG_NET) - const struct io_uring_sqe *sqe = req->sqe; struct io_async_msghdr *kmsg = NULL; struct socket *sock; int ret; @@ -2273,18 +2276,10 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt, sock = sock_from_file(req->file, &ret); if (sock) { - struct user_msghdr __user *msg; struct io_async_ctx io; struct sockaddr_storage addr; unsigned flags; - flags = READ_ONCE(sqe->msg_flags); - if (flags & MSG_DONTWAIT) - req->flags |= REQ_F_NOWAIT; - else if (force_nonblock) - flags |= MSG_DONTWAIT; - - msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); if (req->io) { kmsg = &req->io->msg; kmsg->msg.msg_name = &addr; @@ -2300,7 +2295,14 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt, goto out; } - ret = __sys_recvmsg_sock(sock, &kmsg->msg, msg, kmsg->uaddr, flags); + flags = req->sr_msg.msg_flags; + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if (force_nonblock) + flags |= MSG_DONTWAIT; + + ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.msg, + kmsg->uaddr, flags); if (force_nonblock && ret == -EAGAIN) { if (req->io) return -EAGAIN; From 26a61679f10c6f041726411964b172565021c2eb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2019 09:02:01 -0700 Subject: [PATCH 032/118] io_uring: read 'count' for IORING_OP_TIMEOUT in prep handler Add the count field to struct io_timeout, and ensure the prep handler has read it. Timeout also needs an async context always, set it up in the prep handler if we don't have one. Signed-off-by: Jens Axboe --- fs/io_uring.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 89e5b19044cc..7e8d28750053 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -330,6 +330,7 @@ struct io_timeout { struct file *file; u64 addr; int flags; + unsigned count; }; struct io_rw { @@ -2902,7 +2903,12 @@ static int io_timeout_prep(struct io_kiocb *req, struct io_async_ctx *io, if (flags & ~IORING_TIMEOUT_ABS) return -EINVAL; - data = &io->timeout; + req->timeout.count = READ_ONCE(sqe->off); + + if (!io && io_alloc_async_ctx(req)) + return -ENOMEM; + + data = &req->io->timeout; data->req = req; req->flags |= REQ_F_TIMEOUT; @@ -2920,7 +2926,6 @@ static int io_timeout_prep(struct io_kiocb *req, struct io_async_ctx *io, static int io_timeout(struct io_kiocb *req) { - const struct io_uring_sqe *sqe = req->sqe; unsigned count; struct io_ring_ctx *ctx = req->ctx; struct io_timeout_data *data; @@ -2942,7 +2947,7 @@ static int io_timeout(struct io_kiocb *req) * timeout event to be satisfied. If it isn't set, then this is * a pure timeout request, sequence isn't used. */ - count = READ_ONCE(sqe->off); + count = req->timeout.count; if (!count) { req->flags |= REQ_F_TIMEOUT_NOSEQ; spin_lock_irq(&ctx->completion_lock); From 06b76d44ba25e52711dc7cc4fc75b50907bc6b8e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 19 Dec 2019 14:44:26 -0700 Subject: [PATCH 033/118] io_uring: standardize the prep methods We currently have a mix of use cases. Most of the newer ones are pretty uniform, but we have some older ones that use different calling calling conventions. This is confusing. For the opcodes that currently rely on the req->io->sqe copy saving them from reuse, add a request type struct in the io_kiocb command union to store the data they need. Prepare for all opcodes having a standard prep method, so we can call it in a uniform fashion and outside of the opcode handler. This is in preparation for passing in the 'sqe' pointer, rather than storing it in the io_kiocb. Once we have uniform prep handlers, we can leave all the prep work to that part, and not even pass in the sqe to the opcode handler. This ensures that we don't reuse sqe data inadvertently. Signed-off-by: Jens Axboe --- fs/io_uring.c | 128 +++++++++++++++++++++++++------------------------- 1 file changed, 63 insertions(+), 65 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 7e8d28750053..2cdfbb451fe2 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -371,7 +371,6 @@ struct io_async_rw { }; struct io_async_ctx { - struct io_uring_sqe sqe; union { struct io_async_rw rw; struct io_async_msghdr msg; @@ -433,7 +432,6 @@ struct io_kiocb { #define REQ_F_INFLIGHT 16384 /* on inflight list */ #define REQ_F_COMP_LOCKED 32768 /* completion under lock */ #define REQ_F_HARDLINK 65536 /* doesn't sever on completion < 0 */ -#define REQ_F_PREPPED 131072 /* request already opcode prepared */ u64 user_data; u32 result; u32 sequence; @@ -1501,6 +1499,8 @@ static int io_prep_rw(struct io_kiocb *req, bool force_nonblock) unsigned ioprio; int ret; + if (!sqe) + return 0; if (!req->file) return -EBADF; @@ -1552,6 +1552,7 @@ static int io_prep_rw(struct io_kiocb *req, bool force_nonblock) /* we own ->private, reuse it for the buffer index */ req->rw.kiocb.private = (void *) (unsigned long) READ_ONCE(req->sqe->buf_index); + req->sqe = NULL; return 0; } @@ -1773,13 +1774,7 @@ static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size, static int io_alloc_async_ctx(struct io_kiocb *req) { req->io = kmalloc(sizeof(*req->io), GFP_KERNEL); - if (req->io) { - memcpy(&req->io->sqe, req->sqe, sizeof(req->io->sqe)); - req->sqe = &req->io->sqe; - return 0; - } - - return 1; + return req->io == NULL; } static void io_rw_async(struct io_wq_work **workptr) @@ -1810,12 +1805,14 @@ static int io_read_prep(struct io_kiocb *req, struct iovec **iovec, { ssize_t ret; - ret = io_prep_rw(req, force_nonblock); - if (ret) - return ret; + if (req->sqe) { + ret = io_prep_rw(req, force_nonblock); + if (ret) + return ret; - if (unlikely(!(req->file->f_mode & FMODE_READ))) - return -EBADF; + if (unlikely(!(req->file->f_mode & FMODE_READ))) + return -EBADF; + } return io_import_iovec(READ, req, iovec, iter); } @@ -1829,15 +1826,9 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, size_t iov_count; ssize_t io_size, ret; - if (!req->io) { - ret = io_read_prep(req, &iovec, &iter, force_nonblock); - if (ret < 0) - return ret; - } else { - ret = io_import_iovec(READ, req, &iovec, &iter); - if (ret < 0) - return ret; - } + ret = io_read_prep(req, &iovec, &iter, force_nonblock); + if (ret < 0) + return ret; /* Ensure we clear previously set non-block flag */ if (!force_nonblock) @@ -1901,12 +1892,14 @@ static int io_write_prep(struct io_kiocb *req, struct iovec **iovec, { ssize_t ret; - ret = io_prep_rw(req, force_nonblock); - if (ret) - return ret; + if (req->sqe) { + ret = io_prep_rw(req, force_nonblock); + if (ret) + return ret; - if (unlikely(!(req->file->f_mode & FMODE_WRITE))) - return -EBADF; + if (unlikely(!(req->file->f_mode & FMODE_WRITE))) + return -EBADF; + } return io_import_iovec(WRITE, req, iovec, iter); } @@ -1920,15 +1913,9 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, size_t iov_count; ssize_t ret, io_size; - if (!req->io) { - ret = io_write_prep(req, &iovec, &iter, force_nonblock); - if (ret < 0) - return ret; - } else { - ret = io_import_iovec(WRITE, req, &iovec, &iter); - if (ret < 0) - return ret; - } + ret = io_write_prep(req, &iovec, &iter, force_nonblock); + if (ret < 0) + return ret; /* Ensure we clear previously set non-block flag */ if (!force_nonblock) @@ -2013,7 +2000,7 @@ static int io_prep_fsync(struct io_kiocb *req) const struct io_uring_sqe *sqe = req->sqe; struct io_ring_ctx *ctx = req->ctx; - if (req->flags & REQ_F_PREPPED) + if (!req->sqe) return 0; if (!req->file) return -EBADF; @@ -2029,7 +2016,7 @@ static int io_prep_fsync(struct io_kiocb *req) req->sync.off = READ_ONCE(sqe->off); req->sync.len = READ_ONCE(sqe->len); - req->flags |= REQ_F_PREPPED; + req->sqe = NULL; return 0; } @@ -2095,7 +2082,7 @@ static int io_prep_sfr(struct io_kiocb *req) const struct io_uring_sqe *sqe = req->sqe; struct io_ring_ctx *ctx = req->ctx; - if (req->flags & REQ_F_PREPPED) + if (!sqe) return 0; if (!req->file) return -EBADF; @@ -2108,7 +2095,7 @@ static int io_prep_sfr(struct io_kiocb *req) req->sync.off = READ_ONCE(sqe->off); req->sync.len = READ_ONCE(sqe->len); req->sync.flags = READ_ONCE(sqe->sync_range_flags); - req->flags |= REQ_F_PREPPED; + req->sqe = NULL; return 0; } @@ -2173,12 +2160,17 @@ static int io_sendmsg_prep(struct io_kiocb *req, struct io_async_ctx *io) #if defined(CONFIG_NET) const struct io_uring_sqe *sqe = req->sqe; struct io_sr_msg *sr = &req->sr_msg; + int ret; + if (!sqe) + return 0; sr->msg_flags = READ_ONCE(sqe->msg_flags); sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); io->msg.iov = io->msg.fast_iov; - return sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags, + ret = sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags, &io->msg.iov); + req->sqe = NULL; + return ret; #else return -EOPNOTSUPP; #endif @@ -2253,12 +2245,18 @@ static int io_recvmsg_prep(struct io_kiocb *req, struct io_async_ctx *io) { #if defined(CONFIG_NET) struct io_sr_msg *sr = &req->sr_msg; + int ret; + + if (!req->sqe) + return 0; sr->msg_flags = READ_ONCE(req->sqe->msg_flags); sr->msg = u64_to_user_ptr(READ_ONCE(req->sqe->addr)); io->msg.iov = io->msg.fast_iov; - return recvmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags, + ret = recvmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags, &io->msg.uaddr, &io->msg.iov); + req->sqe = NULL; + return ret; #else return -EOPNOTSUPP; #endif @@ -2336,7 +2334,7 @@ static int io_accept_prep(struct io_kiocb *req) const struct io_uring_sqe *sqe = req->sqe; struct io_accept *accept = &req->accept; - if (req->flags & REQ_F_PREPPED) + if (!req->sqe) return 0; if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) @@ -2347,7 +2345,7 @@ static int io_accept_prep(struct io_kiocb *req) accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2)); accept->flags = READ_ONCE(sqe->accept_flags); - req->flags |= REQ_F_PREPPED; + req->sqe = NULL; return 0; #else return -EOPNOTSUPP; @@ -2416,7 +2414,10 @@ static int io_connect_prep(struct io_kiocb *req, struct io_async_ctx *io) { #if defined(CONFIG_NET) const struct io_uring_sqe *sqe = req->sqe; + int ret; + if (!sqe) + return 0; if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) return -EINVAL; if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags) @@ -2424,8 +2425,10 @@ static int io_connect_prep(struct io_kiocb *req, struct io_async_ctx *io) req->connect.addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); req->connect.addr_len = READ_ONCE(sqe->addr2); - return move_addr_to_kernel(req->connect.addr, req->connect.addr_len, + ret = move_addr_to_kernel(req->connect.addr, req->connect.addr_len, &io->connect.address); + req->sqe = NULL; + return ret; #else return -EOPNOTSUPP; #endif @@ -2526,7 +2529,7 @@ static int io_poll_remove_prep(struct io_kiocb *req) { const struct io_uring_sqe *sqe = req->sqe; - if (req->flags & REQ_F_PREPPED) + if (!sqe) return 0; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -2535,7 +2538,7 @@ static int io_poll_remove_prep(struct io_kiocb *req) return -EINVAL; req->poll.addr = READ_ONCE(sqe->addr); - req->flags |= REQ_F_PREPPED; + req->sqe = NULL; return 0; } @@ -2696,7 +2699,7 @@ static int io_poll_add_prep(struct io_kiocb *req) struct io_poll_iocb *poll = &req->poll; u16 events; - if (req->flags & REQ_F_PREPPED) + if (!sqe) return 0; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -2705,9 +2708,9 @@ static int io_poll_add_prep(struct io_kiocb *req) if (!poll->file) return -EBADF; - req->flags |= REQ_F_PREPPED; events = READ_ONCE(sqe->poll_events); poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; + req->sqe = NULL; return 0; } @@ -2845,7 +2848,7 @@ static int io_timeout_remove_prep(struct io_kiocb *req) { const struct io_uring_sqe *sqe = req->sqe; - if (req->flags & REQ_F_PREPPED) + if (!sqe) return 0; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -2857,7 +2860,7 @@ static int io_timeout_remove_prep(struct io_kiocb *req) if (req->timeout.flags) return -EINVAL; - req->flags |= REQ_F_PREPPED; + req->sqe = NULL; return 0; } @@ -2893,6 +2896,8 @@ static int io_timeout_prep(struct io_kiocb *req, struct io_async_ctx *io, struct io_timeout_data *data; unsigned flags; + if (!sqe) + return 0; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (sqe->ioprio || sqe->buf_index || sqe->len != 1) @@ -2921,6 +2926,7 @@ static int io_timeout_prep(struct io_kiocb *req, struct io_async_ctx *io, data->mode = HRTIMER_MODE_REL; hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode); + req->sqe = NULL; return 0; } @@ -2933,13 +2939,9 @@ static int io_timeout(struct io_kiocb *req) unsigned span = 0; int ret; - if (!req->io) { - if (io_alloc_async_ctx(req)) - return -ENOMEM; - ret = io_timeout_prep(req, req->io, false); - if (ret) - return ret; - } + ret = io_timeout_prep(req, req->io, false); + if (ret) + return ret; data = &req->io->timeout; /* @@ -3069,7 +3071,7 @@ static int io_async_cancel_prep(struct io_kiocb *req) { const struct io_uring_sqe *sqe = req->sqe; - if (req->flags & REQ_F_PREPPED) + if (!sqe) return 0; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -3077,8 +3079,8 @@ static int io_async_cancel_prep(struct io_kiocb *req) sqe->cancel_flags) return -EINVAL; - req->flags |= REQ_F_PREPPED; req->cancel.addr = READ_ONCE(sqe->addr); + req->sqe = NULL; return 0; } @@ -3213,13 +3215,9 @@ static int io_issue_sqe(struct io_kiocb *req, struct io_kiocb **nxt, ret = io_nop(req); break; case IORING_OP_READV: - if (unlikely(req->sqe->buf_index)) - return -EINVAL; ret = io_read(req, nxt, force_nonblock); break; case IORING_OP_WRITEV: - if (unlikely(req->sqe->buf_index)) - return -EINVAL; ret = io_write(req, nxt, force_nonblock); break; case IORING_OP_READ_FIXED: From 3529d8c2b353e6e446277ae96a36e7471cb070fc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 19 Dec 2019 18:24:38 -0700 Subject: [PATCH 034/118] io_uring: pass in 'sqe' to the prep handlers This moves the prep handlers outside of the opcode handlers, and allows us to pass in the sqe directly. If the sqe is non-NULL, it means that the request should be prepared for the first time. With the opcode handlers not having access to the sqe at all, we are guaranteed that the prep handler has setup the request fully by the time we get there. As before, for opcodes that need to copy in more data then the io_kiocb allows for, the io_async_ctx holds that info. If a prep handler is invoked with req->io set, it must use that to retain information for later. Finally, we can remove io_kiocb->sqe as well. Signed-off-by: Jens Axboe --- fs/io_uring.c | 493 +++++++++++++++++++++++++------------------------- 1 file changed, 251 insertions(+), 242 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 2cdfbb451fe2..562e3a1a1bf9 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -398,7 +398,6 @@ struct io_kiocb { struct io_sr_msg sr_msg; }; - const struct io_uring_sqe *sqe; struct io_async_ctx *io; struct file *ring_file; int ring_fd; @@ -629,33 +628,31 @@ static inline bool io_prep_async_work(struct io_kiocb *req, { bool do_hashed = false; - if (req->sqe) { - switch (req->opcode) { - case IORING_OP_WRITEV: - case IORING_OP_WRITE_FIXED: - /* only regular files should be hashed for writes */ - if (req->flags & REQ_F_ISREG) - do_hashed = true; - /* fall-through */ - case IORING_OP_READV: - case IORING_OP_READ_FIXED: - case IORING_OP_SENDMSG: - case IORING_OP_RECVMSG: - case IORING_OP_ACCEPT: - case IORING_OP_POLL_ADD: - case IORING_OP_CONNECT: - /* - * We know REQ_F_ISREG is not set on some of these - * opcodes, but this enables us to keep the check in - * just one place. - */ - if (!(req->flags & REQ_F_ISREG)) - req->work.flags |= IO_WQ_WORK_UNBOUND; - break; - } - if (io_req_needs_user(req)) - req->work.flags |= IO_WQ_WORK_NEEDS_USER; + switch (req->opcode) { + case IORING_OP_WRITEV: + case IORING_OP_WRITE_FIXED: + /* only regular files should be hashed for writes */ + if (req->flags & REQ_F_ISREG) + do_hashed = true; + /* fall-through */ + case IORING_OP_READV: + case IORING_OP_READ_FIXED: + case IORING_OP_SENDMSG: + case IORING_OP_RECVMSG: + case IORING_OP_ACCEPT: + case IORING_OP_POLL_ADD: + case IORING_OP_CONNECT: + /* + * We know REQ_F_ISREG is not set on some of these + * opcodes, but this enables us to keep the check in + * just one place. + */ + if (!(req->flags & REQ_F_ISREG)) + req->work.flags |= IO_WQ_WORK_UNBOUND; + break; } + if (io_req_needs_user(req)) + req->work.flags |= IO_WQ_WORK_NEEDS_USER; *link = io_prep_linked_timeout(req); return do_hashed; @@ -1491,16 +1488,14 @@ static bool io_file_supports_async(struct file *file) return false; } -static int io_prep_rw(struct io_kiocb *req, bool force_nonblock) +static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, + bool force_nonblock) { - const struct io_uring_sqe *sqe = req->sqe; struct io_ring_ctx *ctx = req->ctx; struct kiocb *kiocb = &req->rw.kiocb; unsigned ioprio; int ret; - if (!sqe) - return 0; if (!req->file) return -EBADF; @@ -1547,12 +1542,11 @@ static int io_prep_rw(struct io_kiocb *req, bool force_nonblock) kiocb->ki_complete = io_complete_rw; } - req->rw.addr = READ_ONCE(req->sqe->addr); - req->rw.len = READ_ONCE(req->sqe->len); + req->rw.addr = READ_ONCE(sqe->addr); + req->rw.len = READ_ONCE(sqe->len); /* we own ->private, reuse it for the buffer index */ req->rw.kiocb.private = (void *) (unsigned long) - READ_ONCE(req->sqe->buf_index); - req->sqe = NULL; + READ_ONCE(sqe->buf_index); return 0; } @@ -1800,21 +1794,33 @@ static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size, return 0; } -static int io_read_prep(struct io_kiocb *req, struct iovec **iovec, - struct iov_iter *iter, bool force_nonblock) +static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, + bool force_nonblock) { + struct io_async_ctx *io; + struct iov_iter iter; ssize_t ret; - if (req->sqe) { - ret = io_prep_rw(req, force_nonblock); - if (ret) - return ret; + ret = io_prep_rw(req, sqe, force_nonblock); + if (ret) + return ret; - if (unlikely(!(req->file->f_mode & FMODE_READ))) - return -EBADF; - } + if (unlikely(!(req->file->f_mode & FMODE_READ))) + return -EBADF; - return io_import_iovec(READ, req, iovec, iter); + if (!req->io) + return 0; + + io = req->io; + io->rw.iov = io->rw.fast_iov; + req->io = NULL; + ret = io_import_iovec(READ, req, &io->rw.iov, &iter); + req->io = io; + if (ret < 0) + return ret; + + io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter); + return 0; } static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, @@ -1826,7 +1832,7 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, size_t iov_count; ssize_t io_size, ret; - ret = io_read_prep(req, &iovec, &iter, force_nonblock); + ret = io_import_iovec(READ, req, &iovec, &iter); if (ret < 0) return ret; @@ -1887,21 +1893,33 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, return ret; } -static int io_write_prep(struct io_kiocb *req, struct iovec **iovec, - struct iov_iter *iter, bool force_nonblock) +static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, + bool force_nonblock) { + struct io_async_ctx *io; + struct iov_iter iter; ssize_t ret; - if (req->sqe) { - ret = io_prep_rw(req, force_nonblock); - if (ret) - return ret; + ret = io_prep_rw(req, sqe, force_nonblock); + if (ret) + return ret; - if (unlikely(!(req->file->f_mode & FMODE_WRITE))) - return -EBADF; - } + if (unlikely(!(req->file->f_mode & FMODE_WRITE))) + return -EBADF; - return io_import_iovec(WRITE, req, iovec, iter); + if (!req->io) + return 0; + + io = req->io; + io->rw.iov = io->rw.fast_iov; + req->io = NULL; + ret = io_import_iovec(WRITE, req, &io->rw.iov, &iter); + req->io = io; + if (ret < 0) + return ret; + + io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter); + return 0; } static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, @@ -1913,7 +1931,7 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, size_t iov_count; ssize_t ret, io_size; - ret = io_write_prep(req, &iovec, &iter, force_nonblock); + ret = io_import_iovec(WRITE, req, &iovec, &iter); if (ret < 0) return ret; @@ -1995,13 +2013,10 @@ static int io_nop(struct io_kiocb *req) return 0; } -static int io_prep_fsync(struct io_kiocb *req) +static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - const struct io_uring_sqe *sqe = req->sqe; struct io_ring_ctx *ctx = req->ctx; - if (!req->sqe) - return 0; if (!req->file) return -EBADF; @@ -2016,7 +2031,6 @@ static int io_prep_fsync(struct io_kiocb *req) req->sync.off = READ_ONCE(sqe->off); req->sync.len = READ_ONCE(sqe->len); - req->sqe = NULL; return 0; } @@ -2057,11 +2071,6 @@ static int io_fsync(struct io_kiocb *req, struct io_kiocb **nxt, bool force_nonblock) { struct io_wq_work *work, *old_work; - int ret; - - ret = io_prep_fsync(req); - if (ret) - return ret; /* fsync always requires a blocking context */ if (force_nonblock) { @@ -2077,13 +2086,10 @@ static int io_fsync(struct io_kiocb *req, struct io_kiocb **nxt, return 0; } -static int io_prep_sfr(struct io_kiocb *req) +static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - const struct io_uring_sqe *sqe = req->sqe; struct io_ring_ctx *ctx = req->ctx; - if (!sqe) - return 0; if (!req->file) return -EBADF; @@ -2095,7 +2101,6 @@ static int io_prep_sfr(struct io_kiocb *req) req->sync.off = READ_ONCE(sqe->off); req->sync.len = READ_ONCE(sqe->len); req->sync.flags = READ_ONCE(sqe->sync_range_flags); - req->sqe = NULL; return 0; } @@ -2122,11 +2127,6 @@ static int io_sync_file_range(struct io_kiocb *req, struct io_kiocb **nxt, bool force_nonblock) { struct io_wq_work *work, *old_work; - int ret; - - ret = io_prep_sfr(req); - if (ret) - return ret; /* sync_file_range always requires a blocking context */ if (force_nonblock) { @@ -2155,22 +2155,21 @@ static void io_sendrecv_async(struct io_wq_work **workptr) } #endif -static int io_sendmsg_prep(struct io_kiocb *req, struct io_async_ctx *io) +static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_NET) - const struct io_uring_sqe *sqe = req->sqe; struct io_sr_msg *sr = &req->sr_msg; - int ret; + struct io_async_ctx *io = req->io; - if (!sqe) - return 0; sr->msg_flags = READ_ONCE(sqe->msg_flags); sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); + + if (!io) + return 0; + io->msg.iov = io->msg.fast_iov; - ret = sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags, + return sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags, &io->msg.iov); - req->sqe = NULL; - return ret; #else return -EOPNOTSUPP; #endif @@ -2201,11 +2200,16 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt, kmsg->iov = kmsg->fast_iov; kmsg->msg.msg_iter.iov = kmsg->iov; } else { + struct io_sr_msg *sr = &req->sr_msg; + kmsg = &io.msg; kmsg->msg.msg_name = &addr; - ret = io_sendmsg_prep(req, &io); + + io.msg.iov = io.msg.fast_iov; + ret = sendmsg_copy_msghdr(&io.msg.msg, sr->msg, + sr->msg_flags, &io.msg.iov); if (ret) - goto out; + return ret; } flags = req->sr_msg.msg_flags; @@ -2228,7 +2232,6 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt, ret = -EINTR; } -out: if (!io_wq_current_is_worker() && kmsg && kmsg->iov != kmsg->fast_iov) kfree(kmsg->iov); io_cqring_add_event(req, ret); @@ -2241,22 +2244,22 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt, #endif } -static int io_recvmsg_prep(struct io_kiocb *req, struct io_async_ctx *io) +static int io_recvmsg_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) { #if defined(CONFIG_NET) struct io_sr_msg *sr = &req->sr_msg; - int ret; + struct io_async_ctx *io = req->io; - if (!req->sqe) + sr->msg_flags = READ_ONCE(sqe->msg_flags); + sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); + + if (!io) return 0; - sr->msg_flags = READ_ONCE(req->sqe->msg_flags); - sr->msg = u64_to_user_ptr(READ_ONCE(req->sqe->addr)); io->msg.iov = io->msg.fast_iov; - ret = recvmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags, + return recvmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags, &io->msg.uaddr, &io->msg.iov); - req->sqe = NULL; - return ret; #else return -EOPNOTSUPP; #endif @@ -2287,11 +2290,17 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt, kmsg->iov = kmsg->fast_iov; kmsg->msg.msg_iter.iov = kmsg->iov; } else { + struct io_sr_msg *sr = &req->sr_msg; + kmsg = &io.msg; kmsg->msg.msg_name = &addr; - ret = io_recvmsg_prep(req, &io); + + io.msg.iov = io.msg.fast_iov; + ret = recvmsg_copy_msghdr(&io.msg.msg, sr->msg, + sr->msg_flags, &io.msg.uaddr, + &io.msg.iov); if (ret) - goto out; + return ret; } flags = req->sr_msg.msg_flags; @@ -2315,7 +2324,6 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt, ret = -EINTR; } -out: if (!io_wq_current_is_worker() && kmsg && kmsg->iov != kmsg->fast_iov) kfree(kmsg->iov); io_cqring_add_event(req, ret); @@ -2328,15 +2336,11 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt, #endif } -static int io_accept_prep(struct io_kiocb *req) +static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_NET) - const struct io_uring_sqe *sqe = req->sqe; struct io_accept *accept = &req->accept; - if (!req->sqe) - return 0; - if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) return -EINVAL; if (sqe->ioprio || sqe->len || sqe->buf_index) @@ -2345,7 +2349,6 @@ static int io_accept_prep(struct io_kiocb *req) accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2)); accept->flags = READ_ONCE(sqe->accept_flags); - req->sqe = NULL; return 0; #else return -EOPNOTSUPP; @@ -2393,10 +2396,6 @@ static int io_accept(struct io_kiocb *req, struct io_kiocb **nxt, #if defined(CONFIG_NET) int ret; - ret = io_accept_prep(req); - if (ret) - return ret; - ret = __io_accept(req, nxt, force_nonblock); if (ret == -EAGAIN && force_nonblock) { req->work.func = io_accept_finish; @@ -2410,25 +2409,25 @@ static int io_accept(struct io_kiocb *req, struct io_kiocb **nxt, #endif } -static int io_connect_prep(struct io_kiocb *req, struct io_async_ctx *io) +static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_NET) - const struct io_uring_sqe *sqe = req->sqe; - int ret; + struct io_connect *conn = &req->connect; + struct io_async_ctx *io = req->io; - if (!sqe) - return 0; if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) return -EINVAL; if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags) return -EINVAL; - req->connect.addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); - req->connect.addr_len = READ_ONCE(sqe->addr2); - ret = move_addr_to_kernel(req->connect.addr, req->connect.addr_len, + conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); + conn->addr_len = READ_ONCE(sqe->addr2); + + if (!io) + return 0; + + return move_addr_to_kernel(conn->addr, conn->addr_len, &io->connect.address); - req->sqe = NULL; - return ret; #else return -EOPNOTSUPP; #endif @@ -2445,7 +2444,9 @@ static int io_connect(struct io_kiocb *req, struct io_kiocb **nxt, if (req->io) { io = req->io; } else { - ret = io_connect_prep(req, &__io); + ret = move_addr_to_kernel(req->connect.addr, + req->connect.addr_len, + &__io.connect.address); if (ret) goto out; io = &__io; @@ -2525,12 +2526,9 @@ static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr) return -ENOENT; } -static int io_poll_remove_prep(struct io_kiocb *req) +static int io_poll_remove_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) { - const struct io_uring_sqe *sqe = req->sqe; - - if (!sqe) - return 0; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index || @@ -2538,7 +2536,6 @@ static int io_poll_remove_prep(struct io_kiocb *req) return -EINVAL; req->poll.addr = READ_ONCE(sqe->addr); - req->sqe = NULL; return 0; } @@ -2552,10 +2549,6 @@ static int io_poll_remove(struct io_kiocb *req) u64 addr; int ret; - ret = io_poll_remove_prep(req); - if (ret) - return ret; - addr = req->poll.addr; spin_lock_irq(&ctx->completion_lock); ret = io_poll_cancel(ctx, addr); @@ -2693,14 +2686,11 @@ static void io_poll_req_insert(struct io_kiocb *req) hlist_add_head(&req->hash_node, list); } -static int io_poll_add_prep(struct io_kiocb *req) +static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - const struct io_uring_sqe *sqe = req->sqe; struct io_poll_iocb *poll = &req->poll; u16 events; - if (!sqe) - return 0; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index) @@ -2710,7 +2700,6 @@ static int io_poll_add_prep(struct io_kiocb *req) events = READ_ONCE(sqe->poll_events); poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; - req->sqe = NULL; return 0; } @@ -2721,11 +2710,6 @@ static int io_poll_add(struct io_kiocb *req, struct io_kiocb **nxt) struct io_poll_table ipt; bool cancel = false; __poll_t mask; - int ret; - - ret = io_poll_add_prep(req); - if (ret) - return ret; INIT_IO_WORK(&req->work, io_poll_complete_work); INIT_HLIST_NODE(&req->hash_node); @@ -2844,12 +2828,9 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) return 0; } -static int io_timeout_remove_prep(struct io_kiocb *req) +static int io_timeout_remove_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) { - const struct io_uring_sqe *sqe = req->sqe; - - if (!sqe) - return 0; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len) @@ -2860,7 +2841,6 @@ static int io_timeout_remove_prep(struct io_kiocb *req) if (req->timeout.flags) return -EINVAL; - req->sqe = NULL; return 0; } @@ -2872,10 +2852,6 @@ static int io_timeout_remove(struct io_kiocb *req) struct io_ring_ctx *ctx = req->ctx; int ret; - ret = io_timeout_remove_prep(req); - if (ret) - return ret; - spin_lock_irq(&ctx->completion_lock); ret = io_timeout_cancel(ctx, req->timeout.addr); @@ -2889,15 +2865,12 @@ static int io_timeout_remove(struct io_kiocb *req) return 0; } -static int io_timeout_prep(struct io_kiocb *req, struct io_async_ctx *io, +static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, bool is_timeout_link) { - const struct io_uring_sqe *sqe = req->sqe; struct io_timeout_data *data; unsigned flags; - if (!sqe) - return 0; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (sqe->ioprio || sqe->buf_index || sqe->len != 1) @@ -2910,7 +2883,7 @@ static int io_timeout_prep(struct io_kiocb *req, struct io_async_ctx *io, req->timeout.count = READ_ONCE(sqe->off); - if (!io && io_alloc_async_ctx(req)) + if (!req->io && io_alloc_async_ctx(req)) return -ENOMEM; data = &req->io->timeout; @@ -2926,7 +2899,6 @@ static int io_timeout_prep(struct io_kiocb *req, struct io_async_ctx *io, data->mode = HRTIMER_MODE_REL; hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode); - req->sqe = NULL; return 0; } @@ -2937,11 +2909,7 @@ static int io_timeout(struct io_kiocb *req) struct io_timeout_data *data; struct list_head *entry; unsigned span = 0; - int ret; - ret = io_timeout_prep(req, req->io, false); - if (ret) - return ret; data = &req->io->timeout; /* @@ -3067,12 +3035,9 @@ static void io_async_find_and_cancel(struct io_ring_ctx *ctx, io_put_req_find_next(req, nxt); } -static int io_async_cancel_prep(struct io_kiocb *req) +static int io_async_cancel_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) { - const struct io_uring_sqe *sqe = req->sqe; - - if (!sqe) - return 0; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (sqe->flags || sqe->ioprio || sqe->off || sqe->len || @@ -3080,28 +3045,20 @@ static int io_async_cancel_prep(struct io_kiocb *req) return -EINVAL; req->cancel.addr = READ_ONCE(sqe->addr); - req->sqe = NULL; return 0; } static int io_async_cancel(struct io_kiocb *req, struct io_kiocb **nxt) { struct io_ring_ctx *ctx = req->ctx; - int ret; - - ret = io_async_cancel_prep(req); - if (ret) - return ret; io_async_find_and_cancel(ctx, req, req->cancel.addr, nxt, 0); return 0; } -static int io_req_defer_prep(struct io_kiocb *req) +static int io_req_defer_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) { - struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; - struct io_async_ctx *io = req->io; - struct iov_iter iter; ssize_t ret = 0; switch (req->opcode) { @@ -3109,61 +3066,47 @@ static int io_req_defer_prep(struct io_kiocb *req) break; case IORING_OP_READV: case IORING_OP_READ_FIXED: - /* ensure prep does right import */ - req->io = NULL; - ret = io_read_prep(req, &iovec, &iter, true); - req->io = io; - if (ret < 0) - break; - io_req_map_rw(req, ret, iovec, inline_vecs, &iter); - ret = 0; + ret = io_read_prep(req, sqe, true); break; case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: - /* ensure prep does right import */ - req->io = NULL; - ret = io_write_prep(req, &iovec, &iter, true); - req->io = io; - if (ret < 0) - break; - io_req_map_rw(req, ret, iovec, inline_vecs, &iter); - ret = 0; + ret = io_write_prep(req, sqe, true); break; case IORING_OP_POLL_ADD: - ret = io_poll_add_prep(req); + ret = io_poll_add_prep(req, sqe); break; case IORING_OP_POLL_REMOVE: - ret = io_poll_remove_prep(req); + ret = io_poll_remove_prep(req, sqe); break; case IORING_OP_FSYNC: - ret = io_prep_fsync(req); + ret = io_prep_fsync(req, sqe); break; case IORING_OP_SYNC_FILE_RANGE: - ret = io_prep_sfr(req); + ret = io_prep_sfr(req, sqe); break; case IORING_OP_SENDMSG: - ret = io_sendmsg_prep(req, io); + ret = io_sendmsg_prep(req, sqe); break; case IORING_OP_RECVMSG: - ret = io_recvmsg_prep(req, io); + ret = io_recvmsg_prep(req, sqe); break; case IORING_OP_CONNECT: - ret = io_connect_prep(req, io); + ret = io_connect_prep(req, sqe); break; case IORING_OP_TIMEOUT: - ret = io_timeout_prep(req, io, false); + ret = io_timeout_prep(req, sqe, false); break; case IORING_OP_TIMEOUT_REMOVE: - ret = io_timeout_remove_prep(req); + ret = io_timeout_remove_prep(req, sqe); break; case IORING_OP_ASYNC_CANCEL: - ret = io_async_cancel_prep(req); + ret = io_async_cancel_prep(req, sqe); break; case IORING_OP_LINK_TIMEOUT: - ret = io_timeout_prep(req, io, true); + ret = io_timeout_prep(req, sqe, true); break; case IORING_OP_ACCEPT: - ret = io_accept_prep(req); + ret = io_accept_prep(req, sqe); break; default: printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", @@ -3175,7 +3118,7 @@ static int io_req_defer_prep(struct io_kiocb *req) return ret; } -static int io_req_defer(struct io_kiocb *req) +static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; int ret; @@ -3184,10 +3127,10 @@ static int io_req_defer(struct io_kiocb *req) if (!req_need_defer(req) && list_empty(&ctx->defer_list)) return 0; - if (io_alloc_async_ctx(req)) + if (!req->io && io_alloc_async_ctx(req)) return -EAGAIN; - ret = io_req_defer_prep(req); + ret = io_req_defer_prep(req, sqe); if (ret < 0) return ret; @@ -3203,9 +3146,8 @@ static int io_req_defer(struct io_kiocb *req) return -EIOCBQUEUED; } -__attribute__((nonnull)) -static int io_issue_sqe(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, + struct io_kiocb **nxt, bool force_nonblock) { struct io_ring_ctx *ctx = req->ctx; int ret; @@ -3215,48 +3157,109 @@ static int io_issue_sqe(struct io_kiocb *req, struct io_kiocb **nxt, ret = io_nop(req); break; case IORING_OP_READV: + case IORING_OP_READ_FIXED: + if (sqe) { + ret = io_read_prep(req, sqe, force_nonblock); + if (ret < 0) + break; + } ret = io_read(req, nxt, force_nonblock); break; case IORING_OP_WRITEV: - ret = io_write(req, nxt, force_nonblock); - break; - case IORING_OP_READ_FIXED: - ret = io_read(req, nxt, force_nonblock); - break; case IORING_OP_WRITE_FIXED: + if (sqe) { + ret = io_write_prep(req, sqe, force_nonblock); + if (ret < 0) + break; + } ret = io_write(req, nxt, force_nonblock); break; case IORING_OP_FSYNC: + if (sqe) { + ret = io_prep_fsync(req, sqe); + if (ret < 0) + break; + } ret = io_fsync(req, nxt, force_nonblock); break; case IORING_OP_POLL_ADD: + if (sqe) { + ret = io_poll_add_prep(req, sqe); + if (ret) + break; + } ret = io_poll_add(req, nxt); break; case IORING_OP_POLL_REMOVE: + if (sqe) { + ret = io_poll_remove_prep(req, sqe); + if (ret < 0) + break; + } ret = io_poll_remove(req); break; case IORING_OP_SYNC_FILE_RANGE: + if (sqe) { + ret = io_prep_sfr(req, sqe); + if (ret < 0) + break; + } ret = io_sync_file_range(req, nxt, force_nonblock); break; case IORING_OP_SENDMSG: + if (sqe) { + ret = io_sendmsg_prep(req, sqe); + if (ret < 0) + break; + } ret = io_sendmsg(req, nxt, force_nonblock); break; case IORING_OP_RECVMSG: + if (sqe) { + ret = io_recvmsg_prep(req, sqe); + if (ret) + break; + } ret = io_recvmsg(req, nxt, force_nonblock); break; case IORING_OP_TIMEOUT: + if (sqe) { + ret = io_timeout_prep(req, sqe, false); + if (ret) + break; + } ret = io_timeout(req); break; case IORING_OP_TIMEOUT_REMOVE: + if (sqe) { + ret = io_timeout_remove_prep(req, sqe); + if (ret) + break; + } ret = io_timeout_remove(req); break; case IORING_OP_ACCEPT: + if (sqe) { + ret = io_accept_prep(req, sqe); + if (ret) + break; + } ret = io_accept(req, nxt, force_nonblock); break; case IORING_OP_CONNECT: + if (sqe) { + ret = io_connect_prep(req, sqe); + if (ret) + break; + } ret = io_connect(req, nxt, force_nonblock); break; case IORING_OP_ASYNC_CANCEL: + if (sqe) { + ret = io_async_cancel_prep(req, sqe); + if (ret) + break; + } ret = io_async_cancel(req, nxt); break; default: @@ -3300,7 +3303,7 @@ static void io_wq_submit_work(struct io_wq_work **workptr) req->has_user = (work->flags & IO_WQ_WORK_HAS_MM) != 0; req->in_async = true; do { - ret = io_issue_sqe(req, &nxt, false); + ret = io_issue_sqe(req, NULL, &nxt, false); /* * We can get EAGAIN for polled IO even though we're * forcing a sync submission from here, since we can't @@ -3366,14 +3369,15 @@ static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, return table->files[index & IORING_FILE_TABLE_MASK]; } -static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req) +static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, + const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; unsigned flags; int fd, ret; - flags = READ_ONCE(req->sqe->flags); - fd = READ_ONCE(req->sqe->fd); + flags = READ_ONCE(sqe->flags); + fd = READ_ONCE(sqe->fd); if (flags & IOSQE_IO_DRAIN) req->flags |= REQ_F_IO_DRAIN; @@ -3505,7 +3509,7 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) return nxt; } -static void __io_queue_sqe(struct io_kiocb *req) +static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_kiocb *linked_timeout; struct io_kiocb *nxt = NULL; @@ -3514,7 +3518,7 @@ static void __io_queue_sqe(struct io_kiocb *req) again: linked_timeout = io_prep_linked_timeout(req); - ret = io_issue_sqe(req, &nxt, true); + ret = io_issue_sqe(req, sqe, &nxt, true); /* * We async punt it if the file wasn't marked NOWAIT, or if the file @@ -3561,7 +3565,7 @@ static void __io_queue_sqe(struct io_kiocb *req) } } -static void io_queue_sqe(struct io_kiocb *req) +static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) { int ret; @@ -3571,7 +3575,7 @@ static void io_queue_sqe(struct io_kiocb *req) } req->ctx->drain_next = (req->flags & REQ_F_DRAIN_LINK); - ret = io_req_defer(req); + ret = io_req_defer(req, sqe); if (ret) { if (ret != -EIOCBQUEUED) { io_cqring_add_event(req, ret); @@ -3579,7 +3583,7 @@ static void io_queue_sqe(struct io_kiocb *req) io_double_put_req(req); } } else - __io_queue_sqe(req); + __io_queue_sqe(req, sqe); } static inline void io_queue_link_head(struct io_kiocb *req) @@ -3588,25 +3592,25 @@ static inline void io_queue_link_head(struct io_kiocb *req) io_cqring_add_event(req, -ECANCELED); io_double_put_req(req); } else - io_queue_sqe(req); + io_queue_sqe(req, NULL); } #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \ IOSQE_IO_HARDLINK) -static bool io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state, - struct io_kiocb **link) +static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, + struct io_submit_state *state, struct io_kiocb **link) { struct io_ring_ctx *ctx = req->ctx; int ret; /* enforce forwards compatibility on users */ - if (unlikely(req->sqe->flags & ~SQE_VALID_FLAGS)) { + if (unlikely(sqe->flags & ~SQE_VALID_FLAGS)) { ret = -EINVAL; goto err_req; } - ret = io_req_set_file(state, req); + ret = io_req_set_file(state, req, sqe); if (unlikely(ret)) { err_req: io_cqring_add_event(req, ret); @@ -3624,10 +3628,10 @@ static bool io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state, if (*link) { struct io_kiocb *prev = *link; - if (req->sqe->flags & IOSQE_IO_DRAIN) + if (sqe->flags & IOSQE_IO_DRAIN) (*link)->flags |= REQ_F_DRAIN_LINK | REQ_F_IO_DRAIN; - if (req->sqe->flags & IOSQE_IO_HARDLINK) + if (sqe->flags & IOSQE_IO_HARDLINK) req->flags |= REQ_F_HARDLINK; if (io_alloc_async_ctx(req)) { @@ -3635,7 +3639,7 @@ static bool io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state, goto err_req; } - ret = io_req_defer_prep(req); + ret = io_req_defer_prep(req, sqe); if (ret) { /* fail even hard links since we don't submit */ prev->flags |= REQ_F_FAIL_LINK; @@ -3643,15 +3647,18 @@ static bool io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state, } trace_io_uring_link(ctx, req, prev); list_add_tail(&req->link_list, &prev->link_list); - } else if (req->sqe->flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) { + } else if (sqe->flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) { req->flags |= REQ_F_LINK; - if (req->sqe->flags & IOSQE_IO_HARDLINK) + if (sqe->flags & IOSQE_IO_HARDLINK) req->flags |= REQ_F_HARDLINK; INIT_LIST_HEAD(&req->link_list); + ret = io_req_defer_prep(req, sqe); + if (ret) + req->flags |= REQ_F_FAIL_LINK; *link = req; } else { - io_queue_sqe(req); + io_queue_sqe(req, sqe); } return true; @@ -3696,14 +3703,15 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) } /* - * Fetch an sqe, if one is available. Note that req->sqe will point to memory + * Fetch an sqe, if one is available. Note that sqe_ptr will point to memory * that is mapped by userspace. This means that care needs to be taken to * ensure that reads are stable, as we cannot rely on userspace always * being a good citizen. If members of the sqe are validated and then later * used, it's important that those reads are done through READ_ONCE() to * prevent a re-load down the line. */ -static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req) +static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req, + const struct io_uring_sqe **sqe_ptr) { struct io_rings *rings = ctx->rings; u32 *sq_array = ctx->sq_array; @@ -3730,9 +3738,9 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req) * link list. */ req->sequence = ctx->cached_sq_head; - req->sqe = &ctx->sq_sqes[head]; - req->opcode = READ_ONCE(req->sqe->opcode); - req->user_data = READ_ONCE(req->sqe->user_data); + *sqe_ptr = &ctx->sq_sqes[head]; + req->opcode = READ_ONCE((*sqe_ptr)->opcode); + req->user_data = READ_ONCE((*sqe_ptr)->user_data); ctx->cached_sq_head++; return true; } @@ -3764,6 +3772,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, } for (i = 0; i < nr; i++) { + const struct io_uring_sqe *sqe; struct io_kiocb *req; unsigned int sqe_flags; @@ -3773,7 +3782,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, submitted = -EAGAIN; break; } - if (!io_get_sqring(ctx, req)) { + if (!io_get_sqring(ctx, req, &sqe)) { __io_free_req(req); break; } @@ -3787,7 +3796,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, } submitted++; - sqe_flags = req->sqe->flags; + sqe_flags = sqe->flags; req->ring_file = ring_file; req->ring_fd = ring_fd; @@ -3795,7 +3804,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, req->in_async = async; req->needs_fixed_file = async; trace_io_uring_submit_sqe(ctx, req->user_data, true, async); - if (!io_submit_sqe(req, statep, &link)) + if (!io_submit_sqe(req, sqe, statep, &link)) break; /* * If previous wasn't linked and we have a linked command, From 257bf4176362c3920250d6b972ccb36edd788487 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 9 Dec 2019 16:37:24 +0100 Subject: [PATCH 035/118] pktcdvd: fix regression on 64-bit architectures The support for the compat ioctl did not actually do what it was supposed to do because of a typo, instead it broke native support for CDROM_LAST_WRITTEN and CDROM_SEND_PACKET on all architectures with CONFIG_COMPAT enabled. Fixes: 1b114b0817cc ("pktcdvd: add compat_ioctl handler") Signed-off-by: Arnd Bergmann ---- Please apply for v5.5, I just noticed the regression while rebasing some of the patches I created on top. Signed-off-by: Jens Axboe --- drivers/block/pktcdvd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index ee67bf929fac..861fc65a1b75 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2707,7 +2707,7 @@ static const struct block_device_operations pktcdvd_ops = { .release = pkt_close, .ioctl = pkt_ioctl, #ifdef CONFIG_COMPAT - .ioctl = pkt_compat_ioctl, + .compat_ioctl = pkt_compat_ioctl, #endif .check_events = pkt_check_events, }; From 673bdf8ce0a387ef585c13b69a2676096c6edfe9 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 29 Nov 2019 11:28:22 +0100 Subject: [PATCH 036/118] compat_ioctl: block: handle BLKREPORTZONE/BLKRESETZONE These were added to blkdev_ioctl() but not blkdev_compat_ioctl, so add them now. Cc: # v4.10+ Fixes: 3ed05a987e0f ("blk-zoned: implement ioctls") Reviewed-by: Damien Le Moal Signed-off-by: Arnd Bergmann Signed-off-by: Jens Axboe --- block/compat_ioctl.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index 6ca015f92766..830f91e05fe3 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c @@ -354,6 +354,8 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) * but we call blkdev_ioctl, which gets the lock for us */ case BLKRRPART: + case BLKREPORTZONE: + case BLKRESETZONE: return blkdev_ioctl(bdev, mode, cmd, (unsigned long)compat_ptr(arg)); case BLKBSZSET_32: From 21d37340912d74b1222d43c11aa9dd0687162573 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 29 Nov 2019 11:28:22 +0100 Subject: [PATCH 037/118] compat_ioctl: block: handle BLKGETZONESZ/BLKGETNRZONES These were added to blkdev_ioctl() in v4.20 but not blkdev_compat_ioctl, so add them now. Cc: # v4.20+ Fixes: 72cd87576d1d ("block: Introduce BLKGETZONESZ ioctl") Fixes: 65e4e3eee83d ("block: Introduce BLKGETNRZONES ioctl") Reviewed-by: Damien Le Moal Signed-off-by: Arnd Bergmann Signed-off-by: Jens Axboe --- block/compat_ioctl.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index 830f91e05fe3..f5c1140b8624 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c @@ -356,6 +356,8 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) case BLKRRPART: case BLKREPORTZONE: case BLKRESETZONE: + case BLKGETZONESZ: + case BLKGETNRZONES: return blkdev_ioctl(bdev, mode, cmd, (unsigned long)compat_ptr(arg)); case BLKBSZSET_32: From 4b43f31d65dac587dae09ee27a48fd68a0cfd8c2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 29 Nov 2019 11:28:22 +0100 Subject: [PATCH 038/118] compat_ioctl: block: handle add zone open, close and finish ioctl These were added to blkdev_ioctl() in linux-5.5 but not blkdev_compat_ioctl, so add them now. Fixes: e876df1fe0ad ("block: add zone open, close and finish ioctl support") Reviewed-by: Damien Le Moal Signed-off-by: Arnd Bergmann Signed-off-by: Jens Axboe --- block/compat_ioctl.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index f5c1140b8624..5b13e344229c 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c @@ -356,6 +356,9 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) case BLKRRPART: case BLKREPORTZONE: case BLKRESETZONE: + case BLKOPENZONE: + case BLKCLOSEZONE: + case BLKFINISHZONE: case BLKGETZONESZ: case BLKGETNRZONES: return blkdev_ioctl(bdev, mode, cmd, From b2c0fcd28772f99236d261509bcd242135677965 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 29 Nov 2019 11:28:22 +0100 Subject: [PATCH 039/118] compat_ioctl: block: handle Persistent Reservations These were added to blkdev_ioctl() in linux-5.5 but not blkdev_compat_ioctl, so add them now. Cc: # v4.4+ Fixes: bbd3e064362e ("block: add an API for Persistent Reservations") Signed-off-by: Arnd Bergmann Fold in followup patch from Arnd with missing pr.h header include. Signed-off-by: Jens Axboe --- block/compat_ioctl.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index 5b13e344229c..3ed7a0f144a9 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -408,6 +409,14 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) case BLKTRACETEARDOWN: /* compatible */ ret = blk_trace_ioctl(bdev, cmd, compat_ptr(arg)); return ret; + case IOC_PR_REGISTER: + case IOC_PR_RESERVE: + case IOC_PR_RELEASE: + case IOC_PR_PREEMPT: + case IOC_PR_PREEMPT_ABORT: + case IOC_PR_CLEAR: + return blkdev_ioctl(bdev, mode, cmd, + (unsigned long)compat_ptr(arg)); default: if (disk->fops->compat_ioctl) ret = disk->fops->compat_ioctl(bdev, mode, cmd, arg); From 46cf053efec6a3a5f343fead837777efe8252a46 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 22 Dec 2019 17:02:23 -0800 Subject: [PATCH 040/118] Linux 5.5-rc3 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index e51b53c180bc..caf14acf1953 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 5 PATCHLEVEL = 5 SUBLEVEL = 0 -EXTRAVERSION = -rc2 +EXTRAVERSION = -rc3 NAME = Kleptomaniac Octopus # *DOCUMENTATION* From f54c7898ed1c3c9331376c0337a5049c38f66497 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 22 Dec 2019 23:37:40 +0100 Subject: [PATCH 041/118] bpf: Fix precision tracking for unbounded scalars Anatoly has been fuzzing with kBdysch harness and reported a hang in one of the outcomes. Upon closer analysis, it turns out that precise scalar value tracking is missing a few precision markings for unknown scalars: 0: R1=ctx(id=0,off=0,imm=0) R10=fp0 0: (b7) r0 = 0 1: R0_w=invP0 R1=ctx(id=0,off=0,imm=0) R10=fp0 1: (35) if r0 >= 0xf72e goto pc+0 --> only follow fallthrough 2: R0_w=invP0 R1=ctx(id=0,off=0,imm=0) R10=fp0 2: (35) if r0 >= 0x80fe0000 goto pc+0 --> only follow fallthrough 3: R0_w=invP0 R1=ctx(id=0,off=0,imm=0) R10=fp0 3: (14) w0 -= -536870912 4: R0_w=invP536870912 R1=ctx(id=0,off=0,imm=0) R10=fp0 4: (0f) r1 += r0 5: R0_w=invP536870912 R1_w=inv(id=0) R10=fp0 5: (55) if r1 != 0x104c1500 goto pc+0 --> push other branch for later analysis R0_w=invP536870912 R1_w=inv273421568 R10=fp0 6: R0_w=invP536870912 R1_w=inv273421568 R10=fp0 6: (b7) r0 = 0 7: R0=invP0 R1=inv273421568 R10=fp0 7: (76) if w1 s>= 0xffffff00 goto pc+3 --> only follow goto 11: R0=invP0 R1=inv273421568 R10=fp0 11: (95) exit 6: R0_w=invP536870912 R1_w=inv(id=0) R10=fp0 6: (b7) r0 = 0 propagating r0 7: safe processed 11 insns [...] In the analysis of the second path coming after the successful exit above, the path is being pruned at line 7. Pruning analysis found that both r0 are precise P0 and both R1 are non-precise scalars and given prior path with R1 as non-precise scalar succeeded, this one is therefore safe as well. However, problem is that given condition at insn 7 in the first run, we only followed goto and didn't push the other branch for later analysis, we've never walked the few insns in there and therefore dead-code sanitation rewrites it as goto pc-1, causing the hang depending on the skb address hitting these conditions. The issue is that R1 should have been marked as precise as well such that pruning enforces range check and conluded that new R1 is not in range of old R1. In insn 4, we mark R1 (skb) as unknown scalar via __mark_reg_unbounded() but not mark_reg_unbounded() and therefore regs->precise remains as false. Back in b5dc0163d8fd ("bpf: precise scalar_value tracking"), this was not the case since marking out of __mark_reg_unbounded() had this covered as well. Once in both are set as precise in 4 as they should have been, we conclude that given R1 was in prior fall-through path 0x104c1500 and now is completely unknown, the check at insn 7 concludes that we need to continue walking. Analysis after the fix: 0: R1=ctx(id=0,off=0,imm=0) R10=fp0 0: (b7) r0 = 0 1: R0_w=invP0 R1=ctx(id=0,off=0,imm=0) R10=fp0 1: (35) if r0 >= 0xf72e goto pc+0 2: R0_w=invP0 R1=ctx(id=0,off=0,imm=0) R10=fp0 2: (35) if r0 >= 0x80fe0000 goto pc+0 3: R0_w=invP0 R1=ctx(id=0,off=0,imm=0) R10=fp0 3: (14) w0 -= -536870912 4: R0_w=invP536870912 R1=ctx(id=0,off=0,imm=0) R10=fp0 4: (0f) r1 += r0 5: R0_w=invP536870912 R1_w=invP(id=0) R10=fp0 5: (55) if r1 != 0x104c1500 goto pc+0 R0_w=invP536870912 R1_w=invP273421568 R10=fp0 6: R0_w=invP536870912 R1_w=invP273421568 R10=fp0 6: (b7) r0 = 0 7: R0=invP0 R1=invP273421568 R10=fp0 7: (76) if w1 s>= 0xffffff00 goto pc+3 11: R0=invP0 R1=invP273421568 R10=fp0 11: (95) exit 6: R0_w=invP536870912 R1_w=invP(id=0) R10=fp0 6: (b7) r0 = 0 7: R0_w=invP0 R1_w=invP(id=0) R10=fp0 7: (76) if w1 s>= 0xffffff00 goto pc+3 R0_w=invP0 R1_w=invP(id=0) R10=fp0 8: R0_w=invP0 R1_w=invP(id=0) R10=fp0 8: (a5) if r0 < 0x2007002a goto pc+0 9: R0_w=invP0 R1_w=invP(id=0) R10=fp0 9: (57) r0 &= -16316416 10: R0_w=invP0 R1_w=invP(id=0) R10=fp0 10: (a6) if w0 < 0x1201 goto pc+0 11: R0_w=invP0 R1_w=invP(id=0) R10=fp0 11: (95) exit 11: R0=invP0 R1=invP(id=0) R10=fp0 11: (95) exit processed 16 insns [...] Fixes: 6754172c208d ("bpf: fix precision tracking in presence of bpf2bpf calls") Reported-by: Anatoly Trosinenko Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191222223740.25297-1-daniel@iogearbox.net --- kernel/bpf/verifier.c | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4983940cbdca..6f63ae7a370c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -907,7 +907,8 @@ static const int caller_saved[CALLER_SAVED_REGS] = { BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 }; -static void __mark_reg_not_init(struct bpf_reg_state *reg); +static void __mark_reg_not_init(const struct bpf_verifier_env *env, + struct bpf_reg_state *reg); /* Mark the unknown part of a register (variable offset or scalar value) as * known to have the value @imm. @@ -945,7 +946,7 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env, verbose(env, "mark_reg_known_zero(regs, %u)\n", regno); /* Something bad happened, let's kill all regs */ for (regno = 0; regno < MAX_BPF_REG; regno++) - __mark_reg_not_init(regs + regno); + __mark_reg_not_init(env, regs + regno); return; } __mark_reg_known_zero(regs + regno); @@ -1054,7 +1055,8 @@ static void __mark_reg_unbounded(struct bpf_reg_state *reg) } /* Mark a register as having a completely unknown (scalar) value. */ -static void __mark_reg_unknown(struct bpf_reg_state *reg) +static void __mark_reg_unknown(const struct bpf_verifier_env *env, + struct bpf_reg_state *reg) { /* * Clear type, id, off, and union(map_ptr, range) and @@ -1064,6 +1066,8 @@ static void __mark_reg_unknown(struct bpf_reg_state *reg) reg->type = SCALAR_VALUE; reg->var_off = tnum_unknown; reg->frameno = 0; + reg->precise = env->subprog_cnt > 1 || !env->allow_ptr_leaks ? + true : false; __mark_reg_unbounded(reg); } @@ -1074,19 +1078,16 @@ static void mark_reg_unknown(struct bpf_verifier_env *env, verbose(env, "mark_reg_unknown(regs, %u)\n", regno); /* Something bad happened, let's kill all regs except FP */ for (regno = 0; regno < BPF_REG_FP; regno++) - __mark_reg_not_init(regs + regno); + __mark_reg_not_init(env, regs + regno); return; } - regs += regno; - __mark_reg_unknown(regs); - /* constant backtracking is enabled for root without bpf2bpf calls */ - regs->precise = env->subprog_cnt > 1 || !env->allow_ptr_leaks ? - true : false; + __mark_reg_unknown(env, regs + regno); } -static void __mark_reg_not_init(struct bpf_reg_state *reg) +static void __mark_reg_not_init(const struct bpf_verifier_env *env, + struct bpf_reg_state *reg) { - __mark_reg_unknown(reg); + __mark_reg_unknown(env, reg); reg->type = NOT_INIT; } @@ -1097,10 +1098,10 @@ static void mark_reg_not_init(struct bpf_verifier_env *env, verbose(env, "mark_reg_not_init(regs, %u)\n", regno); /* Something bad happened, let's kill all regs except FP */ for (regno = 0; regno < BPF_REG_FP; regno++) - __mark_reg_not_init(regs + regno); + __mark_reg_not_init(env, regs + regno); return; } - __mark_reg_not_init(regs + regno); + __mark_reg_not_init(env, regs + regno); } #define DEF_NOT_SUBREG (0) @@ -3234,7 +3235,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, } if (state->stack[spi].slot_type[0] == STACK_SPILL && state->stack[spi].spilled_ptr.type == SCALAR_VALUE) { - __mark_reg_unknown(&state->stack[spi].spilled_ptr); + __mark_reg_unknown(env, &state->stack[spi].spilled_ptr); for (j = 0; j < BPF_REG_SIZE; j++) state->stack[spi].slot_type[j] = STACK_MISC; goto mark; @@ -3892,7 +3893,7 @@ static void __clear_all_pkt_pointers(struct bpf_verifier_env *env, if (!reg) continue; if (reg_is_pkt_pointer_any(reg)) - __mark_reg_unknown(reg); + __mark_reg_unknown(env, reg); } } @@ -3920,7 +3921,7 @@ static void release_reg_references(struct bpf_verifier_env *env, if (!reg) continue; if (reg->ref_obj_id == ref_obj_id) - __mark_reg_unknown(reg); + __mark_reg_unknown(env, reg); } } @@ -4582,7 +4583,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, /* Taint dst register if offset had invalid bounds derived from * e.g. dead branches. */ - __mark_reg_unknown(dst_reg); + __mark_reg_unknown(env, dst_reg); return 0; } @@ -4834,13 +4835,13 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, /* Taint dst register if offset had invalid bounds derived from * e.g. dead branches. */ - __mark_reg_unknown(dst_reg); + __mark_reg_unknown(env, dst_reg); return 0; } if (!src_known && opcode != BPF_ADD && opcode != BPF_SUB && opcode != BPF_AND) { - __mark_reg_unknown(dst_reg); + __mark_reg_unknown(env, dst_reg); return 0; } @@ -6982,7 +6983,7 @@ static void clean_func_state(struct bpf_verifier_env *env, /* since the register is unused, clear its state * to make further comparison simpler */ - __mark_reg_not_init(&st->regs[i]); + __mark_reg_not_init(env, &st->regs[i]); } for (i = 0; i < st->allocated_stack / BPF_REG_SIZE; i++) { @@ -6990,7 +6991,7 @@ static void clean_func_state(struct bpf_verifier_env *env, /* liveness must not touch this stack slot anymore */ st->stack[i].spilled_ptr.live |= REG_LIVE_DONE; if (!(live & REG_LIVE_READ)) { - __mark_reg_not_init(&st->stack[i].spilled_ptr); + __mark_reg_not_init(env, &st->stack[i].spilled_ptr); for (j = 0; j < BPF_REG_SIZE; j++) st->stack[i].slot_type[j] = STACK_INVALID; } From 996fa477d118a11e63f77fbb393173560e5a655d Mon Sep 17 00:00:00 2001 From: Leonard Crestez Date: Mon, 11 Nov 2019 20:16:10 +0200 Subject: [PATCH 042/118] PM / devfreq: Drop explicit selection of PM_OPP CONFIG_PM_OPP is already selected by CONFIG_PM_DEVFREQ since commit b9c69e043266 ("PM / devfreq: Add dependency on PM_OPP"). This means that individual drivers shouldn't "select PM_OPP" explicitly. Signed-off-by: Leonard Crestez [cw00.choi: Edit the patch title] Signed-off-by: Chanwoo Choi --- drivers/devfreq/Kconfig | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/devfreq/Kconfig b/drivers/devfreq/Kconfig index defe1d438710..066e6c4efaa2 100644 --- a/drivers/devfreq/Kconfig +++ b/drivers/devfreq/Kconfig @@ -83,7 +83,6 @@ config ARM_EXYNOS_BUS_DEVFREQ select DEVFREQ_GOV_PASSIVE select DEVFREQ_EVENT_EXYNOS_PPMU select PM_DEVFREQ_EVENT - select PM_OPP help This adds the common DEVFREQ driver for Exynos Memory bus. Exynos Memory bus has one more group of memory bus (e.g, MIF and INT block). @@ -98,7 +97,6 @@ config ARM_TEGRA_DEVFREQ ARCH_TEGRA_132_SOC || ARCH_TEGRA_124_SOC || \ ARCH_TEGRA_210_SOC || \ COMPILE_TEST - select PM_OPP help This adds the DEVFREQ driver for the Tegra family of SoCs. It reads ACTMON counters of memory controllers and adjusts the @@ -109,7 +107,6 @@ config ARM_TEGRA20_DEVFREQ depends on (TEGRA_MC && TEGRA20_EMC) || COMPILE_TEST depends on COMMON_CLK select DEVFREQ_GOV_SIMPLE_ONDEMAND - select PM_OPP help This adds the DEVFREQ driver for the Tegra20 family of SoCs. It reads Memory Controller counters and adjusts the operating @@ -121,7 +118,6 @@ config ARM_RK3399_DMC_DEVFREQ select DEVFREQ_EVENT_ROCKCHIP_DFI select DEVFREQ_GOV_SIMPLE_ONDEMAND select PM_DEVFREQ_EVENT - select PM_OPP help This adds the DEVFREQ driver for the RK3399 DMC(Dynamic Memory Controller). It sets the frequency for the memory controller and reads the usage counts From 5fdb0684b5b0f41402161f068d3d84bf6ed1c3f4 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 12 Dec 2019 10:56:31 +0900 Subject: [PATCH 043/118] PM / devfreq: tegra: Add COMMON_CLK dependency Compile-testing this driver fails if CONFIG_COMMON_CLK is not set: drivers/devfreq/tegra30-devfreq.o: In function `tegra_devfreq_target': tegra30-devfreq.c:(.text+0x164): undefined reference to `clk_set_min_rate' Fixes: 35f8dbc72721 ("PM / devfreq: tegra: Enable COMPILE_TEST for the driver") Signed-off-by: Arnd Bergmann Reviewed-by: Dmitry Osipenko Signed-off-by: Chanwoo Choi --- drivers/devfreq/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/devfreq/Kconfig b/drivers/devfreq/Kconfig index 066e6c4efaa2..35535833b6f7 100644 --- a/drivers/devfreq/Kconfig +++ b/drivers/devfreq/Kconfig @@ -97,6 +97,7 @@ config ARM_TEGRA_DEVFREQ ARCH_TEGRA_132_SOC || ARCH_TEGRA_124_SOC || \ ARCH_TEGRA_210_SOC || \ COMPILE_TEST + depends on COMMON_CLK help This adds the DEVFREQ driver for the Tegra family of SoCs. It reads ACTMON counters of memory controllers and adjusts the From 74ac57806af124b57af30f2ca9fc9df0afe6269e Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Tue, 17 Dec 2019 14:20:57 +0000 Subject: [PATCH 044/118] drm/i915/pmu: Ensure monotonic rc6 Avoid rc6 counter going backward in close to 0% RC6 scenarios like: 15.005477996 114,246,613 ns i915/rc6-residency/ 16.005876662 667,657 ns i915/rc6-residency/ 17.006131417 7,286 ns i915/rc6-residency/ 18.006615031 18,446,744,073,708,914,688 ns i915/rc6-residency/ 19.007158361 18,446,744,073,709,447,168 ns i915/rc6-residency/ 20.007806498 0 ns i915/rc6-residency/ 21.008227495 1,440,403 ns i915/rc6-residency/ There are two aspects to this fix. First is not assuming rc6 value zero means GT is asleep since that can also mean GPU is fully busy and we do not want to enter the estimation path in that case. Second is ensuring monotonicity on the estimation path itself. I suspect what is happening is with extremely rapid park/unpark cycles we get no updates on the real rc6 and therefore have to careful not to unconditionally trust use last known real rc6 when creating a new estimation. v2: * Simplify logic by not tracking the estimate but last reported value. Signed-off-by: Tvrtko Ursulin Fixes: 16ffe73c186b ("drm/i915/pmu: Use GT parked for estimating RC6 while asleep") Cc: Chris Wilson Reviewed-by: Chris Wilson # v1 Reviewed-by: Chris Wilson Link: https://patchwork.freedesktop.org/patch/msgid/20191217142057.1000-1-tvrtko.ursulin@linux.intel.com (cherry picked from commit df6a42053513846475ae1fbd224dfbdbcd0c7010) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/i915_pmu.c | 73 +++++++++------------------------ drivers/gpu/drm/i915/i915_pmu.h | 2 +- 2 files changed, 21 insertions(+), 54 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c index 2814218c5ba1..6f09aa0be80a 100644 --- a/drivers/gpu/drm/i915/i915_pmu.c +++ b/drivers/gpu/drm/i915/i915_pmu.c @@ -144,61 +144,40 @@ static inline s64 ktime_since(const ktime_t kt) return ktime_to_ns(ktime_sub(ktime_get(), kt)); } -static u64 __pmu_estimate_rc6(struct i915_pmu *pmu) -{ - u64 val; - - /* - * We think we are runtime suspended. - * - * Report the delta from when the device was suspended to now, - * on top of the last known real value, as the approximated RC6 - * counter value. - */ - val = ktime_since(pmu->sleep_last); - val += pmu->sample[__I915_SAMPLE_RC6].cur; - - pmu->sample[__I915_SAMPLE_RC6_ESTIMATED].cur = val; - - return val; -} - -static u64 __pmu_update_rc6(struct i915_pmu *pmu, u64 val) -{ - /* - * If we are coming back from being runtime suspended we must - * be careful not to report a larger value than returned - * previously. - */ - if (val >= pmu->sample[__I915_SAMPLE_RC6_ESTIMATED].cur) { - pmu->sample[__I915_SAMPLE_RC6_ESTIMATED].cur = 0; - pmu->sample[__I915_SAMPLE_RC6].cur = val; - } else { - val = pmu->sample[__I915_SAMPLE_RC6_ESTIMATED].cur; - } - - return val; -} - static u64 get_rc6(struct intel_gt *gt) { struct drm_i915_private *i915 = gt->i915; struct i915_pmu *pmu = &i915->pmu; unsigned long flags; + bool awake = false; u64 val; - val = 0; if (intel_gt_pm_get_if_awake(gt)) { val = __get_rc6(gt); intel_gt_pm_put_async(gt); + awake = true; } spin_lock_irqsave(&pmu->lock, flags); - if (val) - val = __pmu_update_rc6(pmu, val); + if (awake) { + pmu->sample[__I915_SAMPLE_RC6].cur = val; + } else { + /* + * We think we are runtime suspended. + * + * Report the delta from when the device was suspended to now, + * on top of the last known real value, as the approximated RC6 + * counter value. + */ + val = ktime_since(pmu->sleep_last); + val += pmu->sample[__I915_SAMPLE_RC6].cur; + } + + if (val < pmu->sample[__I915_SAMPLE_RC6_LAST_REPORTED].cur) + val = pmu->sample[__I915_SAMPLE_RC6_LAST_REPORTED].cur; else - val = __pmu_estimate_rc6(pmu); + pmu->sample[__I915_SAMPLE_RC6_LAST_REPORTED].cur = val; spin_unlock_irqrestore(&pmu->lock, flags); @@ -210,20 +189,11 @@ static void park_rc6(struct drm_i915_private *i915) struct i915_pmu *pmu = &i915->pmu; if (pmu->enable & config_enabled_mask(I915_PMU_RC6_RESIDENCY)) - __pmu_update_rc6(pmu, __get_rc6(&i915->gt)); + pmu->sample[__I915_SAMPLE_RC6].cur = __get_rc6(&i915->gt); pmu->sleep_last = ktime_get(); } -static void unpark_rc6(struct drm_i915_private *i915) -{ - struct i915_pmu *pmu = &i915->pmu; - - /* Estimate how long we slept and accumulate that into rc6 counters */ - if (pmu->enable & config_enabled_mask(I915_PMU_RC6_RESIDENCY)) - __pmu_estimate_rc6(pmu); -} - #else static u64 get_rc6(struct intel_gt *gt) @@ -232,7 +202,6 @@ static u64 get_rc6(struct intel_gt *gt) } static void park_rc6(struct drm_i915_private *i915) {} -static void unpark_rc6(struct drm_i915_private *i915) {} #endif @@ -281,8 +250,6 @@ void i915_pmu_gt_unparked(struct drm_i915_private *i915) */ __i915_pmu_maybe_start_timer(pmu); - unpark_rc6(i915); - spin_unlock_irq(&pmu->lock); } diff --git a/drivers/gpu/drm/i915/i915_pmu.h b/drivers/gpu/drm/i915/i915_pmu.h index bf52e3983631..6c1647c5daf2 100644 --- a/drivers/gpu/drm/i915/i915_pmu.h +++ b/drivers/gpu/drm/i915/i915_pmu.h @@ -18,7 +18,7 @@ enum { __I915_SAMPLE_FREQ_ACT = 0, __I915_SAMPLE_FREQ_REQ, __I915_SAMPLE_RC6, - __I915_SAMPLE_RC6_ESTIMATED, + __I915_SAMPLE_RC6_LAST_REPORTED, __I915_NUM_PMU_SAMPLERS }; From d39da686c2f3b8518b6933951d53891aa83227d6 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Wed, 18 Dec 2019 09:35:04 +0000 Subject: [PATCH 045/118] drm/i915/gt: Ratelimit display power w/a For very light workloads that frequently park, acquiring the display power well (required to prevent the dmc from trashing the system) takes longer than the execution. A good example is the igt_coherency selftest, which is slowed down by an order of magnitude in the worst case with powerwell cycling. To prevent frequent cycling, while keeping our fast soft-rc6, use a timer to delay release of the display powerwell. Fixes: 311770173fac ("drm/i915/gt: Schedule request retirement when timeline idles") References: https://gitlab.freedesktop.org/drm/intel/issues/848 Signed-off-by: Chris Wilson Cc: Tvrtko Ursulin Reviewed-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20191218093504.3477048-1-chris@chris-wilson.co.uk (cherry picked from commit 81ff52b705775433a955b2746d37b87bdc89a3d0) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/gt/intel_gt_pm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.c b/drivers/gpu/drm/i915/gt/intel_gt_pm.c index a459a42ad5c2..7e64b7d7d330 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_pm.c +++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.c @@ -94,8 +94,9 @@ static int __gt_park(struct intel_wakeref *wf) intel_uncore_forcewake_put(&i915->uncore, FORCEWAKE_ALL); } + /* Defer dropping the display power well for 100ms, it's slow! */ GEM_BUG_ON(!wakeref); - intel_display_power_put(i915, POWER_DOMAIN_GT_IRQ, wakeref); + intel_display_power_put_async(i915, POWER_DOMAIN_GT_IRQ, wakeref); i915_globals_park(); From e85ade1f50aae464ce196672faa7a099fd1721ed Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Wed, 18 Dec 2019 10:40:43 +0000 Subject: [PATCH 046/118] drm/i915: Hold reference to intel_frontbuffer as we track activity Since obj->frontbuffer is no longer protected by the struct_mutex, as we are processing the execbuf, it may be removed. Mark the intel_frontbuffer as rcu protected, and so acquire a reference to the struct as we track activity upon it. Closes: https://gitlab.freedesktop.org/drm/intel/issues/827 Fixes: 8e7cb1799b4f ("drm/i915: Extract intel_frontbuffer active tracking") Signed-off-by: Chris Wilson Cc: Matthew Auld Cc: # v5.4+ Reviewed-by: Joonas Lahtinen Link: https://patchwork.freedesktop.org/patch/msgid/20191218104043.3539458-1-chris@chris-wilson.co.uk (cherry picked from commit da42104f589d979bbe402703fd836cec60befae1) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/display/intel_display.c | 2 +- .../gpu/drm/i915/display/intel_frontbuffer.c | 16 ++++----- .../gpu/drm/i915/display/intel_frontbuffer.h | 34 +++++++++++++++++-- drivers/gpu/drm/i915/display/intel_overlay.c | 17 +++++++--- drivers/gpu/drm/i915/gem/i915_gem_clflush.c | 3 +- drivers/gpu/drm/i915/gem/i915_gem_domain.c | 4 +-- drivers/gpu/drm/i915/gem/i915_gem_object.c | 26 +++++++++++++- drivers/gpu/drm/i915/gem/i915_gem_object.h | 23 ++++++++++++- .../gpu/drm/i915/gem/i915_gem_object_types.h | 2 +- drivers/gpu/drm/i915/i915_gem.c | 10 +++--- drivers/gpu/drm/i915/i915_vma.c | 10 ++++-- 11 files changed, 116 insertions(+), 31 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_display.c b/drivers/gpu/drm/i915/display/intel_display.c index 6f5e3bd13ad1..effc4250b230 100644 --- a/drivers/gpu/drm/i915/display/intel_display.c +++ b/drivers/gpu/drm/i915/display/intel_display.c @@ -15112,7 +15112,7 @@ intel_prepare_plane_fb(struct drm_plane *plane, return ret; fb_obj_bump_render_priority(obj); - intel_frontbuffer_flush(obj->frontbuffer, ORIGIN_DIRTYFB); + i915_gem_object_flush_frontbuffer(obj, ORIGIN_DIRTYFB); if (!new_plane_state->base.fence) { /* implicit fencing */ struct dma_fence *fence; diff --git a/drivers/gpu/drm/i915/display/intel_frontbuffer.c b/drivers/gpu/drm/i915/display/intel_frontbuffer.c index 84b164f31895..6cb02c912acc 100644 --- a/drivers/gpu/drm/i915/display/intel_frontbuffer.c +++ b/drivers/gpu/drm/i915/display/intel_frontbuffer.c @@ -229,11 +229,11 @@ static void frontbuffer_release(struct kref *ref) vma->display_alignment = I915_GTT_MIN_ALIGNMENT; spin_unlock(&obj->vma.lock); - obj->frontbuffer = NULL; + RCU_INIT_POINTER(obj->frontbuffer, NULL); spin_unlock(&to_i915(obj->base.dev)->fb_tracking.lock); i915_gem_object_put(obj); - kfree(front); + kfree_rcu(front, rcu); } struct intel_frontbuffer * @@ -242,11 +242,7 @@ intel_frontbuffer_get(struct drm_i915_gem_object *obj) struct drm_i915_private *i915 = to_i915(obj->base.dev); struct intel_frontbuffer *front; - spin_lock(&i915->fb_tracking.lock); - front = obj->frontbuffer; - if (front) - kref_get(&front->ref); - spin_unlock(&i915->fb_tracking.lock); + front = __intel_frontbuffer_get(obj); if (front) return front; @@ -262,13 +258,13 @@ intel_frontbuffer_get(struct drm_i915_gem_object *obj) i915_active_may_sleep(frontbuffer_retire)); spin_lock(&i915->fb_tracking.lock); - if (obj->frontbuffer) { + if (rcu_access_pointer(obj->frontbuffer)) { kfree(front); - front = obj->frontbuffer; + front = rcu_dereference_protected(obj->frontbuffer, true); kref_get(&front->ref); } else { i915_gem_object_get(obj); - obj->frontbuffer = front; + rcu_assign_pointer(obj->frontbuffer, front); } spin_unlock(&i915->fb_tracking.lock); diff --git a/drivers/gpu/drm/i915/display/intel_frontbuffer.h b/drivers/gpu/drm/i915/display/intel_frontbuffer.h index adc64d61a4a5..6d41f5394425 100644 --- a/drivers/gpu/drm/i915/display/intel_frontbuffer.h +++ b/drivers/gpu/drm/i915/display/intel_frontbuffer.h @@ -27,10 +27,10 @@ #include #include +#include "gem/i915_gem_object_types.h" #include "i915_active.h" struct drm_i915_private; -struct drm_i915_gem_object; enum fb_op_origin { ORIGIN_GTT, @@ -45,6 +45,7 @@ struct intel_frontbuffer { atomic_t bits; struct i915_active write; struct drm_i915_gem_object *obj; + struct rcu_head rcu; }; void intel_frontbuffer_flip_prepare(struct drm_i915_private *i915, @@ -54,6 +55,35 @@ void intel_frontbuffer_flip_complete(struct drm_i915_private *i915, void intel_frontbuffer_flip(struct drm_i915_private *i915, unsigned frontbuffer_bits); +void intel_frontbuffer_put(struct intel_frontbuffer *front); + +static inline struct intel_frontbuffer * +__intel_frontbuffer_get(const struct drm_i915_gem_object *obj) +{ + struct intel_frontbuffer *front; + + if (likely(!rcu_access_pointer(obj->frontbuffer))) + return NULL; + + rcu_read_lock(); + do { + front = rcu_dereference(obj->frontbuffer); + if (!front) + break; + + if (unlikely(!kref_get_unless_zero(&front->ref))) + continue; + + if (likely(front == rcu_access_pointer(obj->frontbuffer))) + break; + + intel_frontbuffer_put(front); + } while (1); + rcu_read_unlock(); + + return front; +} + struct intel_frontbuffer * intel_frontbuffer_get(struct drm_i915_gem_object *obj); @@ -119,6 +149,4 @@ void intel_frontbuffer_track(struct intel_frontbuffer *old, struct intel_frontbuffer *new, unsigned int frontbuffer_bits); -void intel_frontbuffer_put(struct intel_frontbuffer *front); - #endif /* __INTEL_FRONTBUFFER_H__ */ diff --git a/drivers/gpu/drm/i915/display/intel_overlay.c b/drivers/gpu/drm/i915/display/intel_overlay.c index 848ce07a8ec2..8a98a1aa7adc 100644 --- a/drivers/gpu/drm/i915/display/intel_overlay.c +++ b/drivers/gpu/drm/i915/display/intel_overlay.c @@ -279,12 +279,21 @@ static void intel_overlay_flip_prepare(struct intel_overlay *overlay, struct i915_vma *vma) { enum pipe pipe = overlay->crtc->pipe; + struct intel_frontbuffer *from = NULL, *to = NULL; WARN_ON(overlay->old_vma); - intel_frontbuffer_track(overlay->vma ? overlay->vma->obj->frontbuffer : NULL, - vma ? vma->obj->frontbuffer : NULL, - INTEL_FRONTBUFFER_OVERLAY(pipe)); + if (overlay->vma) + from = intel_frontbuffer_get(overlay->vma->obj); + if (vma) + to = intel_frontbuffer_get(vma->obj); + + intel_frontbuffer_track(from, to, INTEL_FRONTBUFFER_OVERLAY(pipe)); + + if (to) + intel_frontbuffer_put(to); + if (from) + intel_frontbuffer_put(from); intel_frontbuffer_flip_prepare(overlay->i915, INTEL_FRONTBUFFER_OVERLAY(pipe)); @@ -766,7 +775,7 @@ static int intel_overlay_do_put_image(struct intel_overlay *overlay, ret = PTR_ERR(vma); goto out_pin_section; } - intel_frontbuffer_flush(new_bo->frontbuffer, ORIGIN_DIRTYFB); + i915_gem_object_flush_frontbuffer(new_bo, ORIGIN_DIRTYFB); if (!overlay->active) { u32 oconfig; diff --git a/drivers/gpu/drm/i915/gem/i915_gem_clflush.c b/drivers/gpu/drm/i915/gem/i915_gem_clflush.c index b9f504ba3b32..18ee708585a9 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_clflush.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_clflush.c @@ -20,7 +20,8 @@ static void __do_clflush(struct drm_i915_gem_object *obj) { GEM_BUG_ON(!i915_gem_object_has_pages(obj)); drm_clflush_sg(obj->mm.pages); - intel_frontbuffer_flush(obj->frontbuffer, ORIGIN_CPU); + + i915_gem_object_flush_frontbuffer(obj, ORIGIN_CPU); } static int clflush_work(struct dma_fence_work *base) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_domain.c b/drivers/gpu/drm/i915/gem/i915_gem_domain.c index 9937b4c341f1..f86400a191b0 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_domain.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_domain.c @@ -664,7 +664,7 @@ i915_gem_set_domain_ioctl(struct drm_device *dev, void *data, i915_gem_object_unlock(obj); if (write_domain) - intel_frontbuffer_invalidate(obj->frontbuffer, ORIGIN_CPU); + i915_gem_object_invalidate_frontbuffer(obj, ORIGIN_CPU); out_unpin: i915_gem_object_unpin_pages(obj); @@ -784,7 +784,7 @@ int i915_gem_object_prepare_write(struct drm_i915_gem_object *obj, } out: - intel_frontbuffer_invalidate(obj->frontbuffer, ORIGIN_CPU); + i915_gem_object_invalidate_frontbuffer(obj, ORIGIN_CPU); obj->mm.dirty = true; /* return with the pages pinned */ return 0; diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.c b/drivers/gpu/drm/i915/gem/i915_gem_object.c index a50296cce0d8..a596548c07bf 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_object.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_object.c @@ -280,7 +280,7 @@ i915_gem_object_flush_write_domain(struct drm_i915_gem_object *obj, for_each_ggtt_vma(vma, obj) intel_gt_flush_ggtt_writes(vma->vm->gt); - intel_frontbuffer_flush(obj->frontbuffer, ORIGIN_CPU); + i915_gem_object_flush_frontbuffer(obj, ORIGIN_CPU); for_each_ggtt_vma(vma, obj) { if (vma->iomap) @@ -308,6 +308,30 @@ i915_gem_object_flush_write_domain(struct drm_i915_gem_object *obj, obj->write_domain = 0; } +void __i915_gem_object_flush_frontbuffer(struct drm_i915_gem_object *obj, + enum fb_op_origin origin) +{ + struct intel_frontbuffer *front; + + front = __intel_frontbuffer_get(obj); + if (front) { + intel_frontbuffer_flush(front, origin); + intel_frontbuffer_put(front); + } +} + +void __i915_gem_object_invalidate_frontbuffer(struct drm_i915_gem_object *obj, + enum fb_op_origin origin) +{ + struct intel_frontbuffer *front; + + front = __intel_frontbuffer_get(obj); + if (front) { + intel_frontbuffer_invalidate(front, origin); + intel_frontbuffer_put(front); + } +} + void i915_gem_init__objects(struct drm_i915_private *i915) { INIT_WORK(&i915->mm.free_work, __i915_gem_free_work); diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.h b/drivers/gpu/drm/i915/gem/i915_gem_object.h index 458cd51331f1..4b93591fd5c7 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_object.h +++ b/drivers/gpu/drm/i915/gem/i915_gem_object.h @@ -13,8 +13,8 @@ #include +#include "display/intel_frontbuffer.h" #include "i915_gem_object_types.h" - #include "i915_gem_gtt.h" void i915_gem_init__objects(struct drm_i915_private *i915); @@ -463,4 +463,25 @@ int i915_gem_object_wait_priority(struct drm_i915_gem_object *obj, unsigned int flags, const struct i915_sched_attr *attr); +void __i915_gem_object_flush_frontbuffer(struct drm_i915_gem_object *obj, + enum fb_op_origin origin); +void __i915_gem_object_invalidate_frontbuffer(struct drm_i915_gem_object *obj, + enum fb_op_origin origin); + +static inline void +i915_gem_object_flush_frontbuffer(struct drm_i915_gem_object *obj, + enum fb_op_origin origin) +{ + if (unlikely(rcu_access_pointer(obj->frontbuffer))) + __i915_gem_object_flush_frontbuffer(obj, origin); +} + +static inline void +i915_gem_object_invalidate_frontbuffer(struct drm_i915_gem_object *obj, + enum fb_op_origin origin) +{ + if (unlikely(rcu_access_pointer(obj->frontbuffer))) + __i915_gem_object_invalidate_frontbuffer(obj, origin); +} + #endif diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h index 96008374a412..e3f3944fbd90 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h +++ b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h @@ -150,7 +150,7 @@ struct drm_i915_gem_object { */ u16 write_domain; - struct intel_frontbuffer *frontbuffer; + struct intel_frontbuffer __rcu *frontbuffer; /** Current tiling stride for the object, if it's tiled. */ unsigned int tiling_and_stride; diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index d034fa413164..905890e3ac24 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -161,7 +161,7 @@ i915_gem_phys_pwrite(struct drm_i915_gem_object *obj, * We manually control the domain here and pretend that it * remains coherent i.e. in the GTT domain, like shmem_pwrite. */ - intel_frontbuffer_invalidate(obj->frontbuffer, ORIGIN_CPU); + i915_gem_object_invalidate_frontbuffer(obj, ORIGIN_CPU); if (copy_from_user(vaddr, user_data, args->size)) return -EFAULT; @@ -169,7 +169,7 @@ i915_gem_phys_pwrite(struct drm_i915_gem_object *obj, drm_clflush_virt_range(vaddr, args->size); intel_gt_chipset_flush(&to_i915(obj->base.dev)->gt); - intel_frontbuffer_flush(obj->frontbuffer, ORIGIN_CPU); + i915_gem_object_flush_frontbuffer(obj, ORIGIN_CPU); return 0; } @@ -589,7 +589,7 @@ i915_gem_gtt_pwrite_fast(struct drm_i915_gem_object *obj, goto out_unpin; } - intel_frontbuffer_invalidate(obj->frontbuffer, ORIGIN_CPU); + i915_gem_object_invalidate_frontbuffer(obj, ORIGIN_CPU); user_data = u64_to_user_ptr(args->data_ptr); offset = args->offset; @@ -631,7 +631,7 @@ i915_gem_gtt_pwrite_fast(struct drm_i915_gem_object *obj, user_data += page_length; offset += page_length; } - intel_frontbuffer_flush(obj->frontbuffer, ORIGIN_CPU); + i915_gem_object_flush_frontbuffer(obj, ORIGIN_CPU); i915_gem_object_unlock_fence(obj, fence); out_unpin: @@ -721,7 +721,7 @@ i915_gem_shmem_pwrite(struct drm_i915_gem_object *obj, offset = 0; } - intel_frontbuffer_flush(obj->frontbuffer, ORIGIN_CPU); + i915_gem_object_flush_frontbuffer(obj, ORIGIN_CPU); i915_gem_object_unlock_fence(obj, fence); return ret; diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c index e5512f26e20a..01c822256b39 100644 --- a/drivers/gpu/drm/i915/i915_vma.c +++ b/drivers/gpu/drm/i915/i915_vma.c @@ -1104,8 +1104,14 @@ int i915_vma_move_to_active(struct i915_vma *vma, return err; if (flags & EXEC_OBJECT_WRITE) { - if (intel_frontbuffer_invalidate(obj->frontbuffer, ORIGIN_CS)) - i915_active_add_request(&obj->frontbuffer->write, rq); + struct intel_frontbuffer *front; + + front = __intel_frontbuffer_get(obj); + if (unlikely(front)) { + if (intel_frontbuffer_invalidate(front, ORIGIN_CS)) + i915_active_add_request(&front->write, rq); + intel_frontbuffer_put(front); + } dma_resv_add_excl_fence(vma->resv, &rq->fence); obj->write_domain = I915_GEM_DOMAIN_RENDER; From fa633a0f89192379828103957874682d389eae83 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 23 Dec 2019 15:13:26 +0900 Subject: [PATCH 047/118] libbpf: Fix build on read-only filesystems I got the following error when I tried to build perf on a read-only filesystem with O=dir option. $ cd /some/where/ro/linux/tools/perf $ make O=$HOME/build/perf ... CC /home/namhyung/build/perf/lib.o /bin/sh: bpf_helper_defs.h: Read-only file system make[3]: *** [Makefile:184: bpf_helper_defs.h] Error 1 make[2]: *** [Makefile.perf:778: /home/namhyung/build/perf/libbpf.a] Error 2 make[2]: *** Waiting for unfinished jobs.... LD /home/namhyung/build/perf/libperf-in.o AR /home/namhyung/build/perf/libperf.a PERF_VERSION = 5.4.0 make[1]: *** [Makefile.perf:225: sub-make] Error 2 make: *** [Makefile:70: all] Error 2 It was becaused bpf_helper_defs.h was generated in current directory. Move it to OUTPUT directory. Signed-off-by: Namhyung Kim Signed-off-by: Daniel Borkmann Tested-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20191223061326.843366-1-namhyung@kernel.org --- tools/lib/bpf/Makefile | 15 ++++++++------- tools/testing/selftests/bpf/.gitignore | 1 + tools/testing/selftests/bpf/Makefile | 6 +++--- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index defae23a0169..97830e46d1a0 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -138,6 +138,7 @@ STATIC_OBJDIR := $(OUTPUT)staticobjs/ BPF_IN_SHARED := $(SHARED_OBJDIR)libbpf-in.o BPF_IN_STATIC := $(STATIC_OBJDIR)libbpf-in.o VERSION_SCRIPT := libbpf.map +BPF_HELPER_DEFS := $(OUTPUT)bpf_helper_defs.h LIB_TARGET := $(addprefix $(OUTPUT),$(LIB_TARGET)) LIB_FILE := $(addprefix $(OUTPUT),$(LIB_FILE)) @@ -159,7 +160,7 @@ all: fixdep all_cmd: $(CMD_TARGETS) check -$(BPF_IN_SHARED): force elfdep bpfdep bpf_helper_defs.h +$(BPF_IN_SHARED): force elfdep bpfdep $(BPF_HELPER_DEFS) @(test -f ../../include/uapi/linux/bpf.h -a -f ../../../include/uapi/linux/bpf.h && ( \ (diff -B ../../include/uapi/linux/bpf.h ../../../include/uapi/linux/bpf.h >/dev/null) || \ echo "Warning: Kernel ABI header at 'tools/include/uapi/linux/bpf.h' differs from latest version at 'include/uapi/linux/bpf.h'" >&2 )) || true @@ -177,12 +178,12 @@ $(BPF_IN_SHARED): force elfdep bpfdep bpf_helper_defs.h echo "Warning: Kernel ABI header at 'tools/include/uapi/linux/if_xdp.h' differs from latest version at 'include/uapi/linux/if_xdp.h'" >&2 )) || true $(Q)$(MAKE) $(build)=libbpf OUTPUT=$(SHARED_OBJDIR) CFLAGS="$(CFLAGS) $(SHLIB_FLAGS)" -$(BPF_IN_STATIC): force elfdep bpfdep bpf_helper_defs.h +$(BPF_IN_STATIC): force elfdep bpfdep $(BPF_HELPER_DEFS) $(Q)$(MAKE) $(build)=libbpf OUTPUT=$(STATIC_OBJDIR) -bpf_helper_defs.h: $(srctree)/tools/include/uapi/linux/bpf.h +$(BPF_HELPER_DEFS): $(srctree)/tools/include/uapi/linux/bpf.h $(Q)$(srctree)/scripts/bpf_helpers_doc.py --header \ - --file $(srctree)/tools/include/uapi/linux/bpf.h > bpf_helper_defs.h + --file $(srctree)/tools/include/uapi/linux/bpf.h > $(BPF_HELPER_DEFS) $(OUTPUT)libbpf.so: $(OUTPUT)libbpf.so.$(LIBBPF_VERSION) @@ -243,7 +244,7 @@ install_lib: all_cmd $(call do_install_mkdir,$(libdir_SQ)); \ cp -fpR $(LIB_FILE) $(DESTDIR)$(libdir_SQ) -install_headers: bpf_helper_defs.h +install_headers: $(BPF_HELPER_DEFS) $(call QUIET_INSTALL, headers) \ $(call do_install,bpf.h,$(prefix)/include/bpf,644); \ $(call do_install,libbpf.h,$(prefix)/include/bpf,644); \ @@ -251,7 +252,7 @@ install_headers: bpf_helper_defs.h $(call do_install,libbpf_util.h,$(prefix)/include/bpf,644); \ $(call do_install,xsk.h,$(prefix)/include/bpf,644); \ $(call do_install,bpf_helpers.h,$(prefix)/include/bpf,644); \ - $(call do_install,bpf_helper_defs.h,$(prefix)/include/bpf,644); \ + $(call do_install,$(BPF_HELPER_DEFS),$(prefix)/include/bpf,644); \ $(call do_install,bpf_tracing.h,$(prefix)/include/bpf,644); \ $(call do_install,bpf_endian.h,$(prefix)/include/bpf,644); \ $(call do_install,bpf_core_read.h,$(prefix)/include/bpf,644); @@ -271,7 +272,7 @@ config-clean: clean: $(call QUIET_CLEAN, libbpf) $(RM) -rf $(CMD_TARGETS) \ *.o *~ *.a *.so *.so.$(LIBBPF_MAJOR_VERSION) .*.d .*.cmd \ - *.pc LIBBPF-CFLAGS bpf_helper_defs.h \ + *.pc LIBBPF-CFLAGS $(BPF_HELPER_DEFS) \ $(SHARED_OBJDIR) $(STATIC_OBJDIR) $(call QUIET_CLEAN, core-gen) $(RM) $(OUTPUT)FEATURE-DUMP.libbpf diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 419652458da4..1ff0a9f49c01 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -40,3 +40,4 @@ xdping test_cpp /no_alu32 /bpf_gcc +bpf_helper_defs.h diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index e0fe01d9ec33..e2fd6f8d579c 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -120,9 +120,9 @@ force: $(BPFOBJ): force $(MAKE) -C $(BPFDIR) OUTPUT=$(OUTPUT)/ -BPF_HELPERS := $(BPFDIR)/bpf_helper_defs.h $(wildcard $(BPFDIR)/bpf_*.h) -$(BPFDIR)/bpf_helper_defs.h: - $(MAKE) -C $(BPFDIR) OUTPUT=$(OUTPUT)/ bpf_helper_defs.h +BPF_HELPERS := $(OUTPUT)/bpf_helper_defs.h $(wildcard $(BPFDIR)/bpf_*.h) +$(OUTPUT)/bpf_helper_defs.h: + $(MAKE) -C $(BPFDIR) OUTPUT=$(OUTPUT)/ $(OUTPUT)/bpf_helper_defs.h # Get Clang's default includes on this system, as opposed to those seen by # '-target bpf'. This fixes "missing" files on some architectures/distros, From 7935799e041ae10d380d04ea23868240f082bd11 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 17 Dec 2019 20:04:51 -0700 Subject: [PATCH 048/118] cifs: Adjust indentation in smb2_open_file Clang warns: ../fs/cifs/smb2file.c:70:3: warning: misleading indentation; statement is not part of the previous 'if' [-Wmisleading-indentation] if (oparms->tcon->use_resilient) { ^ ../fs/cifs/smb2file.c:66:2: note: previous statement is here if (rc) ^ 1 warning generated. This warning occurs because there is a space after the tab on this line. Remove it so that the indentation is consistent with the Linux kernel coding style and clang no longer warns. Fixes: 592fafe644bf ("Add resilienthandles mount parm") Link: https://github.com/ClangBuiltLinux/linux/issues/826 Signed-off-by: Nathan Chancellor Signed-off-by: Steve French --- fs/cifs/smb2file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index 8b0b512c5792..afe1f03aabe3 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -67,7 +67,7 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, goto out; - if (oparms->tcon->use_resilient) { + if (oparms->tcon->use_resilient) { /* default timeout is 0, servers pick default (120 seconds) */ nr_ioctl_req.Timeout = cpu_to_le32(oparms->tcon->handle_timeout); From 046aca3c25fd28da591f59a2dc1a01848e81e0b2 Mon Sep 17 00:00:00 2001 From: "Paulo Alcantara (SUSE)" Date: Wed, 18 Dec 2019 18:11:37 -0300 Subject: [PATCH 049/118] cifs: Optimize readdir on reparse points When listing a directory with thounsands of files and most of them are reparse points, we simply marked all those dentries for revalidation and then sending additional (compounded) create/getinfo/close requests for each of them. Instead, upon receiving a response from an SMB2_QUERY_DIRECTORY (FileIdFullDirectoryInformation) command, the directory entries that have a file attribute of FILE_ATTRIBUTE_REPARSE_POINT will contain an EaSize field with a reparse tag in it, so we parse it and mark the dentry for revalidation only if it is a DFS or a symlink. Signed-off-by: Paulo Alcantara (SUSE) Reviewed-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/cifsglob.h | 1 + fs/cifs/readdir.c | 63 +++++++++++++++++++++++++++++++++++++++------- 2 files changed, 55 insertions(+), 9 deletions(-) diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index ce9bac756c2a..40705e862451 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -1693,6 +1693,7 @@ struct cifs_fattr { struct timespec64 cf_atime; struct timespec64 cf_mtime; struct timespec64 cf_ctime; + u32 cf_cifstag; }; static inline void free_dfs_info_param(struct dfs_info3_param *param) diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 3925a7bfc74d..d17587c2c4ab 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -139,6 +139,28 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name, dput(dentry); } +static bool reparse_file_needs_reval(const struct cifs_fattr *fattr) +{ + if (!(fattr->cf_cifsattrs & ATTR_REPARSE)) + return false; + /* + * The DFS tags should be only intepreted by server side as per + * MS-FSCC 2.1.2.1, but let's include them anyway. + * + * Besides, if cf_cifstag is unset (0), then we still need it to be + * revalidated to know exactly what reparse point it is. + */ + switch (fattr->cf_cifstag) { + case IO_REPARSE_TAG_DFS: + case IO_REPARSE_TAG_DFSR: + case IO_REPARSE_TAG_SYMLINK: + case IO_REPARSE_TAG_NFS: + case 0: + return true; + } + return false; +} + static void cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb) { @@ -158,7 +180,7 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb) * is a symbolic link, DFS referral or a reparse point with a direct * access like junctions, deduplicated files, NFS symlinks. */ - if (fattr->cf_cifsattrs & ATTR_REPARSE) + if (reparse_file_needs_reval(fattr)) fattr->cf_flags |= CIFS_FATTR_NEED_REVAL; /* non-unix readdir doesn't provide nlink */ @@ -194,19 +216,37 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb) } } +static void __dir_info_to_fattr(struct cifs_fattr *fattr, const void *info) +{ + const FILE_DIRECTORY_INFO *fi = info; + + memset(fattr, 0, sizeof(*fattr)); + fattr->cf_cifsattrs = le32_to_cpu(fi->ExtFileAttributes); + fattr->cf_eof = le64_to_cpu(fi->EndOfFile); + fattr->cf_bytes = le64_to_cpu(fi->AllocationSize); + fattr->cf_createtime = le64_to_cpu(fi->CreationTime); + fattr->cf_atime = cifs_NTtimeToUnix(fi->LastAccessTime); + fattr->cf_ctime = cifs_NTtimeToUnix(fi->ChangeTime); + fattr->cf_mtime = cifs_NTtimeToUnix(fi->LastWriteTime); +} + void cifs_dir_info_to_fattr(struct cifs_fattr *fattr, FILE_DIRECTORY_INFO *info, struct cifs_sb_info *cifs_sb) { - memset(fattr, 0, sizeof(*fattr)); - fattr->cf_cifsattrs = le32_to_cpu(info->ExtFileAttributes); - fattr->cf_eof = le64_to_cpu(info->EndOfFile); - fattr->cf_bytes = le64_to_cpu(info->AllocationSize); - fattr->cf_createtime = le64_to_cpu(info->CreationTime); - fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime); - fattr->cf_ctime = cifs_NTtimeToUnix(info->ChangeTime); - fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime); + __dir_info_to_fattr(fattr, info); + cifs_fill_common_info(fattr, cifs_sb); +} +static void cifs_fulldir_info_to_fattr(struct cifs_fattr *fattr, + SEARCH_ID_FULL_DIR_INFO *info, + struct cifs_sb_info *cifs_sb) +{ + __dir_info_to_fattr(fattr, info); + + /* See MS-FSCC 2.4.18 FileIdFullDirectoryInformation */ + if (fattr->cf_cifsattrs & ATTR_REPARSE) + fattr->cf_cifstag = le32_to_cpu(info->EaSize); cifs_fill_common_info(fattr, cifs_sb); } @@ -755,6 +795,11 @@ static int cifs_filldir(char *find_entry, struct file *file, (FIND_FILE_STANDARD_INFO *)find_entry, cifs_sb); break; + case SMB_FIND_FILE_ID_FULL_DIR_INFO: + cifs_fulldir_info_to_fattr(&fattr, + (SEARCH_ID_FULL_DIR_INFO *)find_entry, + cifs_sb); + break; default: cifs_dir_info_to_fattr(&fattr, (FILE_DIRECTORY_INFO *)find_entry, From 1f424e8bd18754d27b15f49359004b0cea344fb5 Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Sun, 22 Dec 2019 22:46:54 +0800 Subject: [PATCH 050/118] io-wq: remove unused busy list from io_sqe Commit e61df66c69b1 ("io-wq: ensure free/busy list browsing see all items") added a list for io workers in addition to the free and busy lists, not only making worker walk cleaner, but leaving the busy list unused. Let's remove it. Signed-off-by: Hillf Danton Signed-off-by: Jens Axboe --- fs/io-wq.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index 11e80b7252a8..a1c85458f021 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -92,7 +92,6 @@ struct io_wqe { struct io_wqe_acct acct[2]; struct hlist_nulls_head free_list; - struct hlist_nulls_head busy_list; struct list_head all_list; struct io_wq *wq; @@ -327,7 +326,6 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker, if (worker->flags & IO_WORKER_F_FREE) { worker->flags &= ~IO_WORKER_F_FREE; hlist_nulls_del_init_rcu(&worker->nulls_node); - hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->busy_list); } /* @@ -365,7 +363,6 @@ static bool __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker) { if (!(worker->flags & IO_WORKER_F_FREE)) { worker->flags |= IO_WORKER_F_FREE; - hlist_nulls_del_init_rcu(&worker->nulls_node); hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); } @@ -798,10 +795,6 @@ void io_wq_cancel_all(struct io_wq *wq) set_bit(IO_WQ_BIT_CANCEL, &wq->state); - /* - * Browse both lists, as there's a gap between handing work off - * to a worker and the worker putting itself on the busy_list - */ rcu_read_lock(); for_each_node(node) { struct io_wqe *wqe = wq->wqes[node]; @@ -1049,7 +1042,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) spin_lock_init(&wqe->lock); INIT_WQ_LIST(&wqe->work_list); INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0); - INIT_HLIST_NULLS_HEAD(&wqe->busy_list, 1); INIT_LIST_HEAD(&wqe->all_list); } From 7648dd9fc981cec52df8a635fa2eefc249e138b9 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Tue, 10 Dec 2019 17:12:33 -0700 Subject: [PATCH 051/118] selftests: filesystems/epoll: fix build error epoll build fails to find pthread lib. Fix Makefile to use LDLIBS instead of LDFLAGS. LDLIBS is the right flag to use here with -l option when invoking ld. gcc -I../../../../../usr/include/ -lpthread epoll_wakeup_test.c -o .../tools/testing/selftests/filesystems/epoll/epoll_wakeup_test /usr/bin/ld: /tmp/ccaZvJUl.o: in function `kill_timeout': epoll_wakeup_test.c:(.text+0x4dd): undefined reference to `pthread_kill' /usr/bin/ld: epoll_wakeup_test.c:(.text+0x4f2): undefined reference to `pthread_kill' /usr/bin/ld: /tmp/ccaZvJUl.o: in function `epoll9': epoll_wakeup_test.c:(.text+0x6382): undefined reference to `pthread_create' /usr/bin/ld: epoll_wakeup_test.c:(.text+0x64d2): undefined reference to `pthread_create' /usr/bin/ld: epoll_wakeup_test.c:(.text+0x6626): undefined reference to `pthread_join' /usr/bin/ld: epoll_wakeup_test.c:(.text+0x684c): undefined reference to `pthread_tryjoin_np' /usr/bin/ld: epoll_wakeup_test.c:(.text+0x6864): undefined reference to `pthread_kill' /usr/bin/ld: epoll_wakeup_test.c:(.text+0x6878): undefined reference to `pthread_join' Signed-off-by: Shuah Khan --- tools/testing/selftests/filesystems/epoll/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/filesystems/epoll/Makefile b/tools/testing/selftests/filesystems/epoll/Makefile index e62f3d4f68da..78ae4aaf7141 100644 --- a/tools/testing/selftests/filesystems/epoll/Makefile +++ b/tools/testing/selftests/filesystems/epoll/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 CFLAGS += -I../../../../../usr/include/ -LDFLAGS += -lpthread +LDLIBS += -lpthread TEST_GEN_PROGS := epoll_wakeup_test include ../../lib.mk From c65e41538b04e0d64a673828745a00cb68a24371 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Thu, 12 Dec 2019 18:56:06 -0700 Subject: [PATCH 052/118] selftests: firmware: Fix it to do root uid check and skip firmware attempts to load test modules that require root access and fail. Fix it to check for root uid and exit with skip code instead. Before this fix: selftests: firmware: fw_run_tests.sh modprobe: ERROR: could not insert 'test_firmware': Operation not permitted You must have the following enabled in your kernel: CONFIG_TEST_FIRMWARE=y CONFIG_FW_LOADER=y CONFIG_FW_LOADER_USER_HELPER=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y not ok 1 selftests: firmware: fw_run_tests.sh # SKIP With this fix: selftests: firmware: fw_run_tests.sh skip all tests: must be run as root not ok 1 selftests: firmware: fw_run_tests.sh # SKIP Signed-off-by: Shuah Khan Reviwed-by: Greg Kroah-Hartman Signed-off-by: Shuah Khan --- tools/testing/selftests/firmware/fw_lib.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/testing/selftests/firmware/fw_lib.sh b/tools/testing/selftests/firmware/fw_lib.sh index b879305a766d..5b8c0fedee76 100755 --- a/tools/testing/selftests/firmware/fw_lib.sh +++ b/tools/testing/selftests/firmware/fw_lib.sh @@ -34,6 +34,12 @@ test_modprobe() check_mods() { + local uid=$(id -u) + if [ $uid -ne 0 ]; then + echo "skip all tests: must be run as root" >&2 + exit $ksft_skip + fi + trap "test_modprobe" EXIT if [ ! -d $DIR ]; then modprobe test_firmware From 05564c29fc943c467abb1f60ef1f0a68c4c424e9 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Mon, 16 Dec 2019 12:18:40 -0700 Subject: [PATCH 053/118] selftests: livepatch: Fix it to do root uid check and skip livepatch test configures the system and debug environment to run tests. Some of these actions fail without root access and test dumps several permission denied messages before it exits. Fix test-state.sh to call setup_config instead of set_dynamic_debug as suggested by Petr Mladek Fix it to check root uid and exit with skip code instead. Signed-off-by: Shuah Khan Acked-by: Joe Lawrence Reviewed-by: Petr Mladek Signed-off-by: Shuah Khan --- tools/testing/selftests/livepatch/functions.sh | 15 ++++++++++++++- tools/testing/selftests/livepatch/test-state.sh | 3 +-- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/livepatch/functions.sh b/tools/testing/selftests/livepatch/functions.sh index 31eb09e38729..a6e3d5517a6f 100644 --- a/tools/testing/selftests/livepatch/functions.sh +++ b/tools/testing/selftests/livepatch/functions.sh @@ -7,6 +7,9 @@ MAX_RETRIES=600 RETRY_INTERVAL=".1" # seconds +# Kselftest framework requirement - SKIP code is 4 +ksft_skip=4 + # log(msg) - write message to kernel log # msg - insightful words function log() { @@ -18,7 +21,16 @@ function log() { function skip() { log "SKIP: $1" echo "SKIP: $1" >&2 - exit 4 + exit $ksft_skip +} + +# root test +function is_root() { + uid=$(id -u) + if [ $uid -ne 0 ]; then + echo "skip all tests: must be run as root" >&2 + exit $ksft_skip + fi } # die(msg) - game over, man @@ -62,6 +74,7 @@ function set_ftrace_enabled() { # for verbose livepatching output and turn on # the ftrace_enabled sysctl. function setup_config() { + is_root push_config set_dynamic_debug set_ftrace_enabled 1 diff --git a/tools/testing/selftests/livepatch/test-state.sh b/tools/testing/selftests/livepatch/test-state.sh index dc2908c22c26..a08212708115 100755 --- a/tools/testing/selftests/livepatch/test-state.sh +++ b/tools/testing/selftests/livepatch/test-state.sh @@ -8,8 +8,7 @@ MOD_LIVEPATCH=test_klp_state MOD_LIVEPATCH2=test_klp_state2 MOD_LIVEPATCH3=test_klp_state3 -set_dynamic_debug - +setup_config # TEST: Loading and removing a module that modifies the system state From 1c68cbadf09f685e1ffc6cc4c9094e67131f9a2f Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 20 Dec 2019 05:14:03 +0000 Subject: [PATCH 054/118] docs/kunit/start: Use in-tree 'kunit_defconfig' The kunit doc suggests users to get the default `kunitconfig` from an external git tree. However, the file is already located under the `arch/um/configs/` of the kernel tree. Because the local file is easier to access and maintain, this commit updates the doc to use it. Signed-off-by: SeongJae Park Reviewed-by: Brendan Higgins Tested-by: Brendan Higgins Signed-off-by: Shuah Khan --- Documentation/dev-tools/kunit/start.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Documentation/dev-tools/kunit/start.rst b/Documentation/dev-tools/kunit/start.rst index 9d6db892c41c..d5197f1a45cb 100644 --- a/Documentation/dev-tools/kunit/start.rst +++ b/Documentation/dev-tools/kunit/start.rst @@ -32,9 +32,8 @@ regular Kernel config, with the specific test targets as well. .. code-block:: bash - git clone -b master https://kunit.googlesource.com/kunitconfig $PATH_TO_KUNITCONFIG_REPO cd $PATH_TO_LINUX_REPO - ln -s $PATH_TO_KUNIT_CONFIG_REPO/kunitconfig kunitconfig + cp arch/um/configs/kunit_defconfig kunitconfig You may want to add kunitconfig to your local gitignore. From 8c0140facb380da5740182d189be0b0d9deadb3d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 20 Dec 2019 05:14:04 +0000 Subject: [PATCH 055/118] kunit: Remove duplicated defconfig creation '--defconfig' option is handled by the 'main() of the 'kunit.py' but again handled in following 'run_tests()'. This commit removes this duplicated handling of the option in the 'run_tests()'. Signed-off-by: SeongJae Park Reviewed-by: Brendan Higgins Tested-by: Brendan Higgins Signed-off-by: Shuah Khan --- tools/testing/kunit/kunit.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tools/testing/kunit/kunit.py b/tools/testing/kunit/kunit.py index efe06d621983..f8f26951cd1b 100755 --- a/tools/testing/kunit/kunit.py +++ b/tools/testing/kunit/kunit.py @@ -37,9 +37,6 @@ def create_default_kunitconfig(): def run_tests(linux: kunit_kernel.LinuxSourceTree, request: KunitRequest) -> KunitResult: - if request.defconfig: - create_default_kunitconfig() - config_start = time.time() success = linux.build_reconfig(request.build_dir) config_end = time.time() From e3212513a8f07786cb262cf78dfe9c4d0c1b1786 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 20 Dec 2019 05:14:05 +0000 Subject: [PATCH 056/118] kunit: Create default config in '--build_dir' If both '--build_dir' and '--defconfig' are given, the handling of '--defconfig' ignores '--build_dir' option. This commit modifies the behavior to respect '--build_dir' option. Reported-by: Brendan Higgins Suggested-by: Brendan Higgins Signed-off-by: SeongJae Park Reviewed-by: Brendan Higgins Tested-by: Brendan Higgins Signed-off-by: Shuah Khan --- tools/testing/kunit/kunit.py | 11 +++++++++-- tools/testing/kunit/kunit_kernel.py | 4 ++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/tools/testing/kunit/kunit.py b/tools/testing/kunit/kunit.py index f8f26951cd1b..5b222418eacd 100755 --- a/tools/testing/kunit/kunit.py +++ b/tools/testing/kunit/kunit.py @@ -31,9 +31,9 @@ class KunitStatus(Enum): TEST_FAILURE = auto() def create_default_kunitconfig(): - if not os.path.exists(kunit_kernel.KUNITCONFIG_PATH): + if not os.path.exists(kunit_kernel.kunitconfig_path): shutil.copyfile('arch/um/configs/kunit_defconfig', - kunit_kernel.KUNITCONFIG_PATH) + kunit_kernel.kunitconfig_path) def run_tests(linux: kunit_kernel.LinuxSourceTree, request: KunitRequest) -> KunitResult: @@ -114,6 +114,13 @@ def main(argv, linux=None): cli_args = parser.parse_args(argv) if cli_args.subcommand == 'run': + if cli_args.build_dir: + if not os.path.exists(cli_args.build_dir): + os.mkdir(cli_args.build_dir) + kunit_kernel.kunitconfig_path = os.path.join( + cli_args.build_dir, + kunit_kernel.kunitconfig_path) + if cli_args.defconfig: create_default_kunitconfig() diff --git a/tools/testing/kunit/kunit_kernel.py b/tools/testing/kunit/kunit_kernel.py index bf3876835331..c04a12e2f711 100644 --- a/tools/testing/kunit/kunit_kernel.py +++ b/tools/testing/kunit/kunit_kernel.py @@ -14,7 +14,7 @@ import os import kunit_config KCONFIG_PATH = '.config' -KUNITCONFIG_PATH = 'kunitconfig' +kunitconfig_path = 'kunitconfig' class ConfigError(Exception): """Represents an error trying to configure the Linux kernel.""" @@ -82,7 +82,7 @@ class LinuxSourceTree(object): def __init__(self): self._kconfig = kunit_config.Kconfig() - self._kconfig.read_from_file(KUNITCONFIG_PATH) + self._kconfig.read_from_file(kunitconfig_path) self._ops = LinuxSourceTreeOperations() def clean(self): From 609952c2afc9542b336cf9a72d4a09606e05fa2a Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 20 Dec 2019 05:14:06 +0000 Subject: [PATCH 057/118] kunit: Place 'test.log' under the 'build_dir' 'kunit' writes the 'test.log' under the kernel source directory even though a 'build_dir' option is given. As users who use the option might expect the outputs to be placed under the specified directory, this commit modifies the logic to write the log file under the 'build_dir'. Signed-off-by: SeongJae Park Reviewed-by: Brendan Higgins Tested-by: Brendan Higgins Signed-off-by: Shuah Khan --- tools/testing/kunit/kunit.py | 2 +- tools/testing/kunit/kunit_kernel.py | 4 ++-- tools/testing/kunit/kunit_tool_test.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/kunit/kunit.py b/tools/testing/kunit/kunit.py index 5b222418eacd..e4250c4b06fb 100755 --- a/tools/testing/kunit/kunit.py +++ b/tools/testing/kunit/kunit.py @@ -105,7 +105,7 @@ def main(argv, linux=None): run_parser.add_argument('--build_dir', help='As in the make command, it specifies the build ' 'directory.', - type=str, default=None, metavar='build_dir') + type=str, default='', metavar='build_dir') run_parser.add_argument('--defconfig', help='Uses a default kunitconfig.', diff --git a/tools/testing/kunit/kunit_kernel.py b/tools/testing/kunit/kunit_kernel.py index c04a12e2f711..a10c0c787bc1 100644 --- a/tools/testing/kunit/kunit_kernel.py +++ b/tools/testing/kunit/kunit_kernel.py @@ -140,10 +140,10 @@ class LinuxSourceTree(object): return False return True - def run_kernel(self, args=[], timeout=None, build_dir=None): + def run_kernel(self, args=[], timeout=None, build_dir=''): args.extend(['mem=256M']) process = self._ops.linux_bin(args, timeout, build_dir) - with open('test.log', 'w') as f: + with open(os.path.join(build_dir, 'test.log'), 'w') as f: for line in process.stdout: f.write(line.rstrip().decode('ascii') + '\n') yield line.rstrip().decode('ascii') diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py index a2a8ea6beae3..22f16e66b3c1 100755 --- a/tools/testing/kunit/kunit_tool_test.py +++ b/tools/testing/kunit/kunit_tool_test.py @@ -199,7 +199,7 @@ class KUnitMainTest(unittest.TestCase): timeout = 3453 kunit.main(['run', '--timeout', str(timeout)], self.linux_source_mock) assert self.linux_source_mock.build_reconfig.call_count == 1 - self.linux_source_mock.run_kernel.assert_called_once_with(build_dir=None, timeout=timeout) + self.linux_source_mock.run_kernel.assert_called_once_with(build_dir='', timeout=timeout) self.print_mock.assert_any_call(StrContains('Testing complete.')) if __name__ == '__main__': From 14ee5cfd4512ee3d1e0047d8751450dcc6544070 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 20 Dec 2019 05:14:07 +0000 Subject: [PATCH 058/118] kunit: Rename 'kunitconfig' to '.kunitconfig' This commit renames 'kunitconfig' to '.kunitconfig' so that it can be automatically ignored by git and do not disturb people who want to type 'kernel/' by pressing only the 'k' and then 'tab' key. Signed-off-by: SeongJae Park Reviewed-by: Brendan Higgins Tested-by: Brendan Higgins Signed-off-by: Shuah Khan --- Documentation/dev-tools/kunit/start.rst | 12 +++++------- tools/testing/kunit/kunit.py | 2 +- tools/testing/kunit/kunit_kernel.py | 4 ++-- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/Documentation/dev-tools/kunit/start.rst b/Documentation/dev-tools/kunit/start.rst index d5197f1a45cb..4e1d24db6b13 100644 --- a/Documentation/dev-tools/kunit/start.rst +++ b/Documentation/dev-tools/kunit/start.rst @@ -24,18 +24,16 @@ The wrapper can be run with: For more information on this wrapper (also called kunit_tool) checkout the :doc:`kunit-tool` page. -Creating a kunitconfig -====================== +Creating a .kunitconfig +======================= The Python script is a thin wrapper around Kbuild. As such, it needs to be -configured with a ``kunitconfig`` file. This file essentially contains the +configured with a ``.kunitconfig`` file. This file essentially contains the regular Kernel config, with the specific test targets as well. .. code-block:: bash cd $PATH_TO_LINUX_REPO - cp arch/um/configs/kunit_defconfig kunitconfig - -You may want to add kunitconfig to your local gitignore. + cp arch/um/configs/kunit_defconfig .kunitconfig Verifying KUnit Works --------------------- @@ -150,7 +148,7 @@ and the following to ``drivers/misc/Makefile``: obj-$(CONFIG_MISC_EXAMPLE_TEST) += example-test.o -Now add it to your ``kunitconfig``: +Now add it to your ``.kunitconfig``: .. code-block:: none diff --git a/tools/testing/kunit/kunit.py b/tools/testing/kunit/kunit.py index e4250c4b06fb..e59eb9e7f923 100755 --- a/tools/testing/kunit/kunit.py +++ b/tools/testing/kunit/kunit.py @@ -108,7 +108,7 @@ def main(argv, linux=None): type=str, default='', metavar='build_dir') run_parser.add_argument('--defconfig', - help='Uses a default kunitconfig.', + help='Uses a default .kunitconfig.', action='store_true') cli_args = parser.parse_args(argv) diff --git a/tools/testing/kunit/kunit_kernel.py b/tools/testing/kunit/kunit_kernel.py index a10c0c787bc1..cc5d844ecca1 100644 --- a/tools/testing/kunit/kunit_kernel.py +++ b/tools/testing/kunit/kunit_kernel.py @@ -14,7 +14,7 @@ import os import kunit_config KCONFIG_PATH = '.config' -kunitconfig_path = 'kunitconfig' +kunitconfig_path = '.kunitconfig' class ConfigError(Exception): """Represents an error trying to configure the Linux kernel.""" @@ -111,7 +111,7 @@ class LinuxSourceTree(object): return True def build_reconfig(self, build_dir): - """Creates a new .config if it is not a subset of the kunitconfig.""" + """Creates a new .config if it is not a subset of the .kunitconfig.""" kconfig_path = get_kconfig_path(build_dir) if os.path.exists(kconfig_path): existing_kconfig = kunit_config.Kconfig() From b1b35201bcf411191986eefdef5f5fb5f41cd879 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 20 Dec 2019 05:14:08 +0000 Subject: [PATCH 059/118] kunit/kunit_tool_test: Test '--build_dir' option run This commit adds kunit tool test for the '--build_dir' option. Signed-off-by: SeongJae Park Reviewed-by: Brendan Higgins Tested-by: Brendan Higgins Signed-off-by: Shuah Khan --- tools/testing/kunit/kunit_tool_test.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py index 22f16e66b3c1..cba97756ac4a 100755 --- a/tools/testing/kunit/kunit_tool_test.py +++ b/tools/testing/kunit/kunit_tool_test.py @@ -174,6 +174,7 @@ class KUnitMainTest(unittest.TestCase): kunit.main(['run'], self.linux_source_mock) assert self.linux_source_mock.build_reconfig.call_count == 1 assert self.linux_source_mock.run_kernel.call_count == 1 + self.linux_source_mock.run_kernel.assert_called_once_with(build_dir='', timeout=300) self.print_mock.assert_any_call(StrContains('Testing complete.')) def test_run_passes_args_fail(self): @@ -202,5 +203,12 @@ class KUnitMainTest(unittest.TestCase): self.linux_source_mock.run_kernel.assert_called_once_with(build_dir='', timeout=timeout) self.print_mock.assert_any_call(StrContains('Testing complete.')) + def test_run_builddir(self): + build_dir = '.kunit' + kunit.main(['run', '--build_dir', build_dir], self.linux_source_mock) + assert self.linux_source_mock.build_reconfig.call_count == 1 + self.linux_source_mock.run_kernel.assert_called_once_with(build_dir=build_dir, timeout=300) + self.print_mock.assert_any_call(StrContains('Testing complete.')) + if __name__ == '__main__': unittest.main() From af9cb29c5488381083b0b5ccdfb3cd931063384a Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Wed, 11 Dec 2019 11:28:57 -0500 Subject: [PATCH 060/118] rseq/selftests: Turn off timeout setting As the rseq selftests can run for a long period of time, disable the timeout that the general selftests have. Signed-off-by: Mathieu Desnoyers Cc: Shuah Khan Cc: Thomas Gleixner Cc: Peter Zijlstra (Intel) Cc: "Paul E. McKenney" Cc: Boqun Feng Cc: "H . Peter Anvin" Cc: Paul Turner Cc: Dmitry Vyukov Signed-off-by: Shuah Khan --- tools/testing/selftests/rseq/settings | 1 + 1 file changed, 1 insertion(+) create mode 100644 tools/testing/selftests/rseq/settings diff --git a/tools/testing/selftests/rseq/settings b/tools/testing/selftests/rseq/settings new file mode 100644 index 000000000000..e7b9417537fb --- /dev/null +++ b/tools/testing/selftests/rseq/settings @@ -0,0 +1 @@ +timeout=0 From 8df34c56321479bfa1ec732c675b686c2b4df412 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Wed, 11 Dec 2019 11:17:13 -0500 Subject: [PATCH 061/118] rseq/selftests: Fix: Namespace gettid() for compatibility with glibc 2.30 glibc 2.30 introduces gettid() in public headers, which clashes with the internal static definition within rseq selftests. Rename gettid() to rseq_gettid() to eliminate this symbol name clash. Reported-by: Tommi T. Rantala Signed-off-by: Mathieu Desnoyers Cc: Shuah Khan Cc: Tommi T. Rantala Cc: Thomas Gleixner Cc: Peter Zijlstra (Intel) Cc: "Paul E. McKenney" Cc: Boqun Feng Cc: "H . Peter Anvin" Cc: Paul Turner Cc: Dmitry Vyukov Cc: # v4.18+ Signed-off-by: Shuah Khan --- tools/testing/selftests/rseq/param_test.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/rseq/param_test.c b/tools/testing/selftests/rseq/param_test.c index eec2663261f2..e8a657a5f48a 100644 --- a/tools/testing/selftests/rseq/param_test.c +++ b/tools/testing/selftests/rseq/param_test.c @@ -15,7 +15,7 @@ #include #include -static inline pid_t gettid(void) +static inline pid_t rseq_gettid(void) { return syscall(__NR_gettid); } @@ -373,11 +373,12 @@ void *test_percpu_spinlock_thread(void *arg) rseq_percpu_unlock(&data->lock, cpu); #ifndef BENCHMARK if (i != 0 && !(i % (reps / 10))) - printf_verbose("tid %d: count %lld\n", (int) gettid(), i); + printf_verbose("tid %d: count %lld\n", + (int) rseq_gettid(), i); #endif } printf_verbose("tid %d: number of rseq abort: %d, signals delivered: %u\n", - (int) gettid(), nr_abort, signals_delivered); + (int) rseq_gettid(), nr_abort, signals_delivered); if (!opt_disable_rseq && thread_data->reg && rseq_unregister_current_thread()) abort(); @@ -454,11 +455,12 @@ void *test_percpu_inc_thread(void *arg) } while (rseq_unlikely(ret)); #ifndef BENCHMARK if (i != 0 && !(i % (reps / 10))) - printf_verbose("tid %d: count %lld\n", (int) gettid(), i); + printf_verbose("tid %d: count %lld\n", + (int) rseq_gettid(), i); #endif } printf_verbose("tid %d: number of rseq abort: %d, signals delivered: %u\n", - (int) gettid(), nr_abort, signals_delivered); + (int) rseq_gettid(), nr_abort, signals_delivered); if (!opt_disable_rseq && thread_data->reg && rseq_unregister_current_thread()) abort(); @@ -605,7 +607,7 @@ void *test_percpu_list_thread(void *arg) } printf_verbose("tid %d: number of rseq abort: %d, signals delivered: %u\n", - (int) gettid(), nr_abort, signals_delivered); + (int) rseq_gettid(), nr_abort, signals_delivered); if (!opt_disable_rseq && rseq_unregister_current_thread()) abort(); @@ -796,7 +798,7 @@ void *test_percpu_buffer_thread(void *arg) } printf_verbose("tid %d: number of rseq abort: %d, signals delivered: %u\n", - (int) gettid(), nr_abort, signals_delivered); + (int) rseq_gettid(), nr_abort, signals_delivered); if (!opt_disable_rseq && rseq_unregister_current_thread()) abort(); @@ -1011,7 +1013,7 @@ void *test_percpu_memcpy_buffer_thread(void *arg) } printf_verbose("tid %d: number of rseq abort: %d, signals delivered: %u\n", - (int) gettid(), nr_abort, signals_delivered); + (int) rseq_gettid(), nr_abort, signals_delivered); if (!opt_disable_rseq && rseq_unregister_current_thread()) abort(); From 2a1f40adfb54ca65dc4c93bad444dd23b800a76e Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 20 Dec 2019 15:33:18 -0500 Subject: [PATCH 062/118] rseq/selftests: Clarify rseq_prepare_unload() helper requirements The rseq.h UAPI now documents that the rseq_cs field must be cleared before reclaiming memory that contains the targeted struct rseq_cs, but also that the rseq_cs field must be cleared before reclaiming memory of the code pointed to by the rseq_cs start_ip and post_commit_offset fields. While we can expect that use of dlclose(3) will typically unmap both struct rseq_cs and its associated code at once, nothing would theoretically prevent a JIT from reclaiming the code without reclaiming the struct rseq_cs, which would erroneously allow the kernel to consider new code which is not a rseq critical section as a rseq critical section following a code reclaim. Suggested-by: Florian Weimer Signed-off-by: Mathieu Desnoyers Cc: Shuah Khan Cc: Florian Weimer Cc: Thomas Gleixner Cc: Peter Zijlstra (Intel) Cc: "Paul E. McKenney" Cc: Boqun Feng Cc: "H . Peter Anvin" Cc: Paul Turner Cc: Dmitry Vyukov Signed-off-by: Shuah Khan --- tools/testing/selftests/rseq/rseq.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/rseq/rseq.h b/tools/testing/selftests/rseq/rseq.h index d40d60e7499e..3f63eb362b92 100644 --- a/tools/testing/selftests/rseq/rseq.h +++ b/tools/testing/selftests/rseq/rseq.h @@ -149,11 +149,13 @@ static inline void rseq_clear_rseq_cs(void) /* * rseq_prepare_unload() should be invoked by each thread executing a rseq * critical section at least once between their last critical section and - * library unload of the library defining the rseq critical section - * (struct rseq_cs). This also applies to use of rseq in code generated by - * JIT: rseq_prepare_unload() should be invoked at least once by each - * thread executing a rseq critical section before reclaim of the memory - * holding the struct rseq_cs. + * library unload of the library defining the rseq critical section (struct + * rseq_cs) or the code referred to by the struct rseq_cs start_ip and + * post_commit_offset fields. This also applies to use of rseq in code + * generated by JIT: rseq_prepare_unload() should be invoked at least once by + * each thread executing a rseq critical section before reclaim of the memory + * holding the struct rseq_cs or reclaim of the code pointed to by struct + * rseq_cs start_ip and post_commit_offset fields. */ static inline void rseq_prepare_unload(void) { From fd1c4bc6e9b34a5e4fe7a3130a49380ef9d7037c Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Tue, 24 Dec 2019 09:14:29 -0700 Subject: [PATCH 063/118] io-wq: add cond_resched() to worker thread Reschedule the current IO worker to cut the risk that it is becoming a cpu hog. Signed-off-by: Hillf Danton Signed-off-by: Jens Axboe --- fs/io-wq.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/io-wq.c b/fs/io-wq.c index a1c85458f021..541c8a3e0bbb 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -429,6 +429,8 @@ static void io_worker_handle_work(struct io_worker *worker) if (signal_pending(current)) flush_signals(current); + cond_resched(); + spin_lock_irq(&worker->lock); worker->cur_work = work; spin_unlock_irq(&worker->lock); From feed8a4fc9d46c3126fb9fcae0e9248270c6321a Mon Sep 17 00:00:00 2001 From: Antonio Messina Date: Thu, 19 Dec 2019 15:08:03 +0100 Subject: [PATCH 064/118] udp: fix integer overflow while computing available space in sk_rcvbuf When the size of the receive buffer for a socket is close to 2^31 when computing if we have enough space in the buffer to copy a packet from the queue to the buffer we might hit an integer overflow. When an user set net.core.rmem_default to a value close to 2^31 UDP packets are dropped because of this overflow. This can be visible, for instance, with failure to resolve hostnames. This can be fixed by casting sk_rcvbuf (which is an int) to unsigned int, similarly to how it is done in TCP. Signed-off-by: Antonio Messina Signed-off-by: David S. Miller --- net/ipv4/udp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 4da5758cc718..93a355b6b092 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1475,7 +1475,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) * queue contains some other skb */ rmem = atomic_add_return(size, &sk->sk_rmem_alloc); - if (rmem > (size + sk->sk_rcvbuf)) + if (rmem > (size + (unsigned int)sk->sk_rcvbuf)) goto uncharge_drop; spin_lock(&list->lock); From 61d5d4062876e21331c3d0ba4b02dbd50c06a658 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Fri, 20 Dec 2019 15:03:44 -0300 Subject: [PATCH 065/118] sctp: fix err handling of stream initialization The fix on 951c6db954a1 fixed the issued reported there but introduced another. When the allocation fails within sctp_stream_init() it is okay/necessary to free the genradix. But it is also called when adding new streams, from sctp_send_add_streams() and sctp_process_strreset_addstrm_in() and in those situations it cannot just free the genradix because by then it is a fully operational association. The fix here then is to only free the genradix in sctp_stream_init() and on those other call sites move on with what it already had and let the subsequent error handling to handle it. Tested with the reproducers from this report and the previous one, with lksctp-tools and sctp-tests. Reported-by: syzbot+9a1bc632e78a1a98488b@syzkaller.appspotmail.com Fixes: 951c6db954a1 ("sctp: fix memleak on err handling of stream initialization") Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/stream.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/net/sctp/stream.c b/net/sctp/stream.c index 6a30392068a0..c1a100d2fed3 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -84,10 +84,8 @@ static int sctp_stream_alloc_out(struct sctp_stream *stream, __u16 outcnt, return 0; ret = genradix_prealloc(&stream->out, outcnt, gfp); - if (ret) { - genradix_free(&stream->out); + if (ret) return ret; - } stream->outcnt = outcnt; return 0; @@ -102,10 +100,8 @@ static int sctp_stream_alloc_in(struct sctp_stream *stream, __u16 incnt, return 0; ret = genradix_prealloc(&stream->in, incnt, gfp); - if (ret) { - genradix_free(&stream->in); + if (ret) return ret; - } stream->incnt = incnt; return 0; @@ -123,7 +119,7 @@ int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, * a new one with new outcnt to save memory if needed. */ if (outcnt == stream->outcnt) - goto in; + goto handle_in; /* Filter out chunks queued on streams that won't exist anymore */ sched->unsched_all(stream); @@ -132,24 +128,28 @@ int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, ret = sctp_stream_alloc_out(stream, outcnt, gfp); if (ret) - goto out; + goto out_err; for (i = 0; i < stream->outcnt; i++) SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN; -in: +handle_in: sctp_stream_interleave_init(stream); if (!incnt) goto out; ret = sctp_stream_alloc_in(stream, incnt, gfp); - if (ret) { - sched->free(stream); - genradix_free(&stream->out); - stream->outcnt = 0; - goto out; - } + if (ret) + goto in_err; + goto out; + +in_err: + sched->free(stream); + genradix_free(&stream->in); +out_err: + genradix_free(&stream->out); + stream->outcnt = 0; out: return ret; } From 7c3125f0a6ebc17846c5908ad7d6056d66c1c426 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 20 Dec 2019 11:24:21 -0800 Subject: [PATCH 066/118] net: dsa: bcm_sf2: Fix IP fragment location and behavior The IP fragment is specified through user-defined field as the first bit of the first user-defined word. We were previously trying to extract it from the user-defined mask which could not possibly work. The ip_frag is also supposed to be a boolean, if we do not cast it as such, we risk overwriting the next fields in CFP_DATA(6) which would render the rule inoperative. Fixes: 7318166cacad ("net: dsa: bcm_sf2: Add support for ethtool::rxnfc") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/bcm_sf2_cfp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/dsa/bcm_sf2_cfp.c b/drivers/net/dsa/bcm_sf2_cfp.c index f3f0c3f07391..1962c8330daa 100644 --- a/drivers/net/dsa/bcm_sf2_cfp.c +++ b/drivers/net/dsa/bcm_sf2_cfp.c @@ -358,7 +358,7 @@ static int bcm_sf2_cfp_ipv4_rule_set(struct bcm_sf2_priv *priv, int port, return -EINVAL; } - ip_frag = be32_to_cpu(fs->m_ext.data[0]); + ip_frag = !!(be32_to_cpu(fs->h_ext.data[0]) & 1); /* Locate the first rule available */ if (fs->location == RX_CLS_LOC_ANY) @@ -569,7 +569,7 @@ static int bcm_sf2_cfp_rule_cmp(struct bcm_sf2_priv *priv, int port, if (rule->fs.flow_type != fs->flow_type || rule->fs.ring_cookie != fs->ring_cookie || - rule->fs.m_ext.data[0] != fs->m_ext.data[0]) + rule->fs.h_ext.data[0] != fs->h_ext.data[0]) continue; switch (fs->flow_type & ~FLOW_EXT) { @@ -621,7 +621,7 @@ static int bcm_sf2_cfp_ipv6_rule_set(struct bcm_sf2_priv *priv, int port, return -EINVAL; } - ip_frag = be32_to_cpu(fs->m_ext.data[0]); + ip_frag = !!(be32_to_cpu(fs->h_ext.data[0]) & 1); layout = &udf_tcpip6_layout; slice_num = bcm_sf2_get_slice_number(layout, 0); From bd085ef678b2cc8c38c105673dfe8ff8f5ec0c57 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Sun, 22 Dec 2019 10:51:09 +0800 Subject: [PATCH 067/118] net: add bool confirm_neigh parameter for dst_ops.update_pmtu The MTU update code is supposed to be invoked in response to real networking events that update the PMTU. In IPv6 PMTU update function __ip6_rt_update_pmtu() we called dst_confirm_neigh() to update neighbor confirmed time. But for tunnel code, it will call pmtu before xmit, like: - tnl_update_pmtu() - skb_dst_update_pmtu() - ip6_rt_update_pmtu() - __ip6_rt_update_pmtu() - dst_confirm_neigh() If the tunnel remote dst mac address changed and we still do the neigh confirm, we will not be able to update neigh cache and ping6 remote will failed. So for this ip_tunnel_xmit() case, _EVEN_ if the MTU is changed, we should not be invoking dst_confirm_neigh() as we have no evidence of successful two-way communication at this point. On the other hand it is also important to keep the neigh reachability fresh for TCP flows, so we cannot remove this dst_confirm_neigh() call. To fix the issue, we have to add a new bool parameter for dst_ops.update_pmtu to choose whether we should do neigh update or not. I will add the parameter in this patch and set all the callers to true to comply with the previous way, and fix the tunnel code one by one on later patches. v5: No change. v4: No change. v3: Do not remove dst_confirm_neigh, but add a new bool parameter in dst_ops.update_pmtu to control whether we should do neighbor confirm. Also split the big patch to small ones for each area. v2: Remove dst_confirm_neigh in __ip6_rt_update_pmtu. Suggested-by: David Miller Reviewed-by: Guillaume Nault Acked-by: David Ahern Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- drivers/net/gtp.c | 2 +- include/net/dst.h | 2 +- include/net/dst_ops.h | 3 ++- net/bridge/br_nf_core.c | 3 ++- net/decnet/dn_route.c | 6 ++++-- net/ipv4/inet_connection_sock.c | 2 +- net/ipv4/route.c | 9 ++++++--- net/ipv4/xfrm4_policy.c | 5 +++-- net/ipv6/inet6_connection_sock.c | 2 +- net/ipv6/ip6_gre.c | 2 +- net/ipv6/route.c | 22 +++++++++++++++------- net/ipv6/xfrm6_policy.c | 5 +++-- net/netfilter/ipvs/ip_vs_xmit.c | 2 +- net/sctp/transport.c | 2 +- 14 files changed, 42 insertions(+), 25 deletions(-) diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index e5b7d6d2286e..913062017be9 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -540,7 +540,7 @@ static int gtp_build_skb_ip4(struct sk_buff *skb, struct net_device *dev, mtu = dst_mtu(&rt->dst); } - rt->dst.ops->update_pmtu(&rt->dst, NULL, skb, mtu); + rt->dst.ops->update_pmtu(&rt->dst, NULL, skb, mtu, true); if (!skb_is_gso(skb) && (iph->frag_off & htons(IP_DF)) && mtu < ntohs(iph->tot_len)) { diff --git a/include/net/dst.h b/include/net/dst.h index 8224dad2ae94..593630e0e076 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -516,7 +516,7 @@ static inline void skb_dst_update_pmtu(struct sk_buff *skb, u32 mtu) struct dst_entry *dst = skb_dst(skb); if (dst && dst->ops->update_pmtu) - dst->ops->update_pmtu(dst, NULL, skb, mtu); + dst->ops->update_pmtu(dst, NULL, skb, mtu, true); } static inline void skb_tunnel_check_pmtu(struct sk_buff *skb, diff --git a/include/net/dst_ops.h b/include/net/dst_ops.h index 5ec645f27ee3..443863c7b8da 100644 --- a/include/net/dst_ops.h +++ b/include/net/dst_ops.h @@ -27,7 +27,8 @@ struct dst_ops { struct dst_entry * (*negative_advice)(struct dst_entry *); void (*link_failure)(struct sk_buff *); void (*update_pmtu)(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, u32 mtu); + struct sk_buff *skb, u32 mtu, + bool confirm_neigh); void (*redirect)(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); int (*local_out)(struct net *net, struct sock *sk, struct sk_buff *skb); diff --git a/net/bridge/br_nf_core.c b/net/bridge/br_nf_core.c index 2cdfc5d6c25d..8c69f0c95a8e 100644 --- a/net/bridge/br_nf_core.c +++ b/net/bridge/br_nf_core.c @@ -22,7 +22,8 @@ #endif static void fake_update_pmtu(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, u32 mtu) + struct sk_buff *skb, u32 mtu, + bool confirm_neigh) { } diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index aea918135ec3..08c3dc45f1a4 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -110,7 +110,8 @@ static void dn_dst_ifdown(struct dst_entry *, struct net_device *dev, int how); static struct dst_entry *dn_dst_negative_advice(struct dst_entry *); static void dn_dst_link_failure(struct sk_buff *); static void dn_dst_update_pmtu(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb , u32 mtu); + struct sk_buff *skb , u32 mtu, + bool confirm_neigh); static void dn_dst_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); static struct neighbour *dn_dst_neigh_lookup(const struct dst_entry *dst, @@ -251,7 +252,8 @@ static int dn_dst_gc(struct dst_ops *ops) * advertise to the other end). */ static void dn_dst_update_pmtu(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, u32 mtu) + struct sk_buff *skb, u32 mtu, + bool confirm_neigh) { struct dn_route *rt = (struct dn_route *) dst; struct neighbour *n = rt->n; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index e4c6e8b40490..18c0d5bffe12 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -1086,7 +1086,7 @@ struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu) if (!dst) goto out; } - dst->ops->update_pmtu(dst, sk, NULL, mtu); + dst->ops->update_pmtu(dst, sk, NULL, mtu, true); dst = __sk_dst_check(sk, 0); if (!dst) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index f88c93c38f11..87e979f2b74a 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -139,7 +139,8 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst); static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); static void ipv4_link_failure(struct sk_buff *skb); static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, u32 mtu); + struct sk_buff *skb, u32 mtu, + bool confirm_neigh); static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); static void ipv4_dst_destroy(struct dst_entry *dst); @@ -1043,7 +1044,8 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) } static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, u32 mtu) + struct sk_buff *skb, u32 mtu, + bool confirm_neigh) { struct rtable *rt = (struct rtable *) dst; struct flowi4 fl4; @@ -2687,7 +2689,8 @@ static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst) } static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, u32 mtu) + struct sk_buff *skb, u32 mtu, + bool confirm_neigh) { } diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 35b84b52b702..9ebd54752e03 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -100,12 +100,13 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, } static void xfrm4_update_pmtu(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, u32 mtu) + struct sk_buff *skb, u32 mtu, + bool confirm_neigh) { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; struct dst_entry *path = xdst->route; - path->ops->update_pmtu(path, sk, skb, mtu); + path->ops->update_pmtu(path, sk, skb, mtu, confirm_neigh); } static void xfrm4_redirect(struct dst_entry *dst, struct sock *sk, diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index fe9cb8d1adca..e315526fa244 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -146,7 +146,7 @@ struct dst_entry *inet6_csk_update_pmtu(struct sock *sk, u32 mtu) if (IS_ERR(dst)) return NULL; - dst->ops->update_pmtu(dst, sk, NULL, mtu); + dst->ops->update_pmtu(dst, sk, NULL, mtu, true); dst = inet6_csk_route_socket(sk, &fl6); return IS_ERR(dst) ? NULL : dst; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 9d0965252ddf..3ba69174ad6c 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1040,7 +1040,7 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, /* TooBig packet may have updated dst->dev's mtu */ if (!t->parms.collect_md && dst && dst_mtu(dst) > dst->dev->mtu) - dst->ops->update_pmtu(dst, NULL, skb, dst->dev->mtu); + dst->ops->update_pmtu(dst, NULL, skb, dst->dev->mtu, true); err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, NEXTHDR_GRE); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index b59940416cb5..affb51c11a25 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -95,7 +95,8 @@ static int ip6_pkt_prohibit(struct sk_buff *skb); static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb); static void ip6_link_failure(struct sk_buff *skb); static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, u32 mtu); + struct sk_buff *skb, u32 mtu, + bool confirm_neigh); static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif, @@ -264,7 +265,8 @@ static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst) } static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, u32 mtu) + struct sk_buff *skb, u32 mtu, + bool confirm_neigh) { } @@ -2692,7 +2694,8 @@ static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) } static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, - const struct ipv6hdr *iph, u32 mtu) + const struct ipv6hdr *iph, u32 mtu, + bool confirm_neigh) { const struct in6_addr *daddr, *saddr; struct rt6_info *rt6 = (struct rt6_info *)dst; @@ -2710,7 +2713,10 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, daddr = NULL; saddr = NULL; } - dst_confirm_neigh(dst, daddr); + + if (confirm_neigh) + dst_confirm_neigh(dst, daddr); + mtu = max_t(u32, mtu, IPV6_MIN_MTU); if (mtu >= dst_mtu(dst)) return; @@ -2764,9 +2770,11 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, } static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, u32 mtu) + struct sk_buff *skb, u32 mtu, + bool confirm_neigh) { - __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu); + __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu, + confirm_neigh); } void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, @@ -2785,7 +2793,7 @@ void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, dst = ip6_route_output(net, NULL, &fl6); if (!dst->error) - __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu)); + __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu), true); dst_release(dst); } EXPORT_SYMBOL_GPL(ip6_update_pmtu); diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 699e0730ce8e..af7a4b8b1e9c 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -98,12 +98,13 @@ static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, } static void xfrm6_update_pmtu(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, u32 mtu) + struct sk_buff *skb, u32 mtu, + bool confirm_neigh) { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; struct dst_entry *path = xdst->route; - path->ops->update_pmtu(path, sk, skb, mtu); + path->ops->update_pmtu(path, sk, skb, mtu, confirm_neigh); } static void xfrm6_redirect(struct dst_entry *dst, struct sock *sk, diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index b1e300f8881b..b00866d777fe 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -208,7 +208,7 @@ static inline void maybe_update_pmtu(int skb_af, struct sk_buff *skb, int mtu) struct rtable *ort = skb_rtable(skb); if (!skb->dev && sk && sk_fullsock(sk)) - ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu); + ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu, true); } static inline bool ensure_mtu_is_adequate(struct netns_ipvs *ipvs, int skb_af, diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 7235a6032671..3bbe1a58ec87 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -263,7 +263,7 @@ bool sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu) pf->af->from_sk(&addr, sk); pf->to_sk_daddr(&t->ipaddr, sk); - dst->ops->update_pmtu(dst, sk, NULL, pmtu); + dst->ops->update_pmtu(dst, sk, NULL, pmtu, true); pf->to_sk_daddr(&addr, sk); dst = sctp_transport_dst_check(t); From 675d76ad0ad5bf41c9a129772ef0aba8f57ea9a7 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Sun, 22 Dec 2019 10:51:10 +0800 Subject: [PATCH 068/118] ip6_gre: do not confirm neighbor when do pmtu update When we do ipv6 gre pmtu update, we will also do neigh confirm currently. This will cause the neigh cache be refreshed and set to REACHABLE before xmit. But if the remote mac address changed, e.g. device is deleted and recreated, we will not able to notice this and still use the old mac address as the neigh cache is REACHABLE. Fix this by disable neigh confirm when do pmtu update v5: No change. v4: No change. v3: Do not remove dst_confirm_neigh, but add a new bool parameter in dst_ops.update_pmtu to control whether we should do neighbor confirm. Also split the big patch to small ones for each area. v2: Remove dst_confirm_neigh in __ip6_rt_update_pmtu. Reported-by: Jianlin Shi Reviewed-by: Guillaume Nault Acked-by: David Ahern Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 3ba69174ad6c..ee968d980746 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1040,7 +1040,7 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, /* TooBig packet may have updated dst->dev's mtu */ if (!t->parms.collect_md && dst && dst_mtu(dst) > dst->dev->mtu) - dst->ops->update_pmtu(dst, NULL, skb, dst->dev->mtu, true); + dst->ops->update_pmtu(dst, NULL, skb, dst->dev->mtu, false); err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, NEXTHDR_GRE); From 6e9105c73f8d2163d12d5dfd762fd75483ed30f5 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Sun, 22 Dec 2019 10:51:11 +0800 Subject: [PATCH 069/118] gtp: do not confirm neighbor when do pmtu update When do IPv6 tunnel PMTU update and calls __ip6_rt_update_pmtu() in the end, we should not call dst_confirm_neigh() as there is no two-way communication. Although GTP only support ipv4 right now, and __ip_rt_update_pmtu() does not call dst_confirm_neigh(), we still set it to false to keep consistency with IPv6 code. v5: No change. v4: No change. v3: Do not remove dst_confirm_neigh, but add a new bool parameter in dst_ops.update_pmtu to control whether we should do neighbor confirm. Also split the big patch to small ones for each area. v2: Remove dst_confirm_neigh in __ip6_rt_update_pmtu. Reviewed-by: Guillaume Nault Acked-by: David Ahern Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- drivers/net/gtp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index 913062017be9..fca471e27f39 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -540,7 +540,7 @@ static int gtp_build_skb_ip4(struct sk_buff *skb, struct net_device *dev, mtu = dst_mtu(&rt->dst); } - rt->dst.ops->update_pmtu(&rt->dst, NULL, skb, mtu, true); + rt->dst.ops->update_pmtu(&rt->dst, NULL, skb, mtu, false); if (!skb_is_gso(skb) && (iph->frag_off & htons(IP_DF)) && mtu < ntohs(iph->tot_len)) { From 07dc35c6e3cc3c001915d05f5bf21f80a39a0970 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Sun, 22 Dec 2019 10:51:12 +0800 Subject: [PATCH 070/118] net/dst: add new function skb_dst_update_pmtu_no_confirm Add a new function skb_dst_update_pmtu_no_confirm() for callers who need update pmtu but should not do neighbor confirm. v5: No change. v4: No change. v3: Do not remove dst_confirm_neigh, but add a new bool parameter in dst_ops.update_pmtu to control whether we should do neighbor confirm. Also split the big patch to small ones for each area. v2: Remove dst_confirm_neigh in __ip6_rt_update_pmtu. Reviewed-by: Guillaume Nault Acked-by: David Ahern Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- include/net/dst.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/include/net/dst.h b/include/net/dst.h index 593630e0e076..dc7cc1f1051c 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -519,6 +519,15 @@ static inline void skb_dst_update_pmtu(struct sk_buff *skb, u32 mtu) dst->ops->update_pmtu(dst, NULL, skb, mtu, true); } +/* update dst pmtu but not do neighbor confirm */ +static inline void skb_dst_update_pmtu_no_confirm(struct sk_buff *skb, u32 mtu) +{ + struct dst_entry *dst = skb_dst(skb); + + if (dst && dst->ops->update_pmtu) + dst->ops->update_pmtu(dst, NULL, skb, mtu, false); +} + static inline void skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst, int headroom) From 7a1592bcb15d71400a98632727791d1e68ea0ee8 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Sun, 22 Dec 2019 10:51:13 +0800 Subject: [PATCH 071/118] tunnel: do not confirm neighbor when do pmtu update When do tunnel PMTU update and calls __ip6_rt_update_pmtu() in the end, we should not call dst_confirm_neigh() as there is no two-way communication. v5: No Change. v4: Update commit description v3: Do not remove dst_confirm_neigh, but add a new bool parameter in dst_ops.update_pmtu to control whether we should do neighbor confirm. Also split the big patch to small ones for each area. v2: Remove dst_confirm_neigh in __ip6_rt_update_pmtu. Fixes: 0dec879f636f ("net: use dst_confirm_neigh for UDP, RAW, ICMP, L2TP") Reviewed-by: Guillaume Nault Tested-by: Guillaume Nault Acked-by: David Ahern Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel.c | 2 +- net/ipv6/ip6_tunnel.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 38c02bb62e2c..0fe2a5d3e258 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -505,7 +505,7 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; if (skb_valid_dst(skb)) - skb_dst_update_pmtu(skb, mtu); + skb_dst_update_pmtu_no_confirm(skb, mtu); if (skb->protocol == htons(ETH_P_IP)) { if (!skb_is_gso(skb) && diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 754a484d35df..2f376dbc37d5 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -640,7 +640,7 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (rel_info > dst_mtu(skb_dst(skb2))) goto out; - skb_dst_update_pmtu(skb2, rel_info); + skb_dst_update_pmtu_no_confirm(skb2, rel_info); } icmp_send(skb2, rel_type, rel_code, htonl(rel_info)); @@ -1132,7 +1132,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, mtu = max(mtu, skb->protocol == htons(ETH_P_IPV6) ? IPV6_MIN_MTU : IPV4_MIN_MTU); - skb_dst_update_pmtu(skb, mtu); + skb_dst_update_pmtu_no_confirm(skb, mtu); if (skb->len - t->tun_hlen - eth_hlen > mtu && !skb_is_gso(skb)) { *pmtu = mtu; err = -EMSGSIZE; From 8247a79efa2f28b44329f363272550c1738377de Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Sun, 22 Dec 2019 10:51:14 +0800 Subject: [PATCH 072/118] vti: do not confirm neighbor when do pmtu update When do IPv6 tunnel PMTU update and calls __ip6_rt_update_pmtu() in the end, we should not call dst_confirm_neigh() as there is no two-way communication. Although vti and vti6 are immune to this problem because they are IFF_NOARP interfaces, as Guillaume pointed. There is still no sense to confirm neighbour here. v5: Update commit description. v4: No change. v3: Do not remove dst_confirm_neigh, but add a new bool parameter in dst_ops.update_pmtu to control whether we should do neighbor confirm. Also split the big patch to small ones for each area. v2: Remove dst_confirm_neigh in __ip6_rt_update_pmtu. Reviewed-by: Guillaume Nault Acked-by: David Ahern Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- net/ipv4/ip_vti.c | 2 +- net/ipv6/ip6_vti.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 9b153c7fcbb4..e90b600c7a25 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -214,7 +214,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, mtu = dst_mtu(dst); if (skb->len > mtu) { - skb_dst_update_pmtu(skb, mtu); + skb_dst_update_pmtu_no_confirm(skb, mtu); if (skb->protocol == htons(ETH_P_IP)) { icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 024db17386d2..6f08b760c2a7 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -479,7 +479,7 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) mtu = dst_mtu(dst); if (skb->len > mtu) { - skb_dst_update_pmtu(skb, mtu); + skb_dst_update_pmtu_no_confirm(skb, mtu); if (skb->protocol == htons(ETH_P_IPV6)) { if (mtu < IPV6_MIN_MTU) From 4d42df46d6372ece4cb4279870b46c2ea7304a47 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Sun, 22 Dec 2019 10:51:15 +0800 Subject: [PATCH 073/118] sit: do not confirm neighbor when do pmtu update When do IPv6 tunnel PMTU update and calls __ip6_rt_update_pmtu() in the end, we should not call dst_confirm_neigh() as there is no two-way communication. v5: No change. v4: No change. v3: Do not remove dst_confirm_neigh, but add a new bool parameter in dst_ops.update_pmtu to control whether we should do neighbor confirm. Also split the big patch to small ones for each area. v2: Remove dst_confirm_neigh in __ip6_rt_update_pmtu. Reviewed-by: Guillaume Nault Acked-by: David Ahern Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- net/ipv6/sit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index b2ccbc473127..98954830c40b 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -944,7 +944,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, } if (tunnel->parms.iph.daddr) - skb_dst_update_pmtu(skb, mtu); + skb_dst_update_pmtu_no_confirm(skb, mtu); if (skb->len > mtu && !skb_is_gso(skb)) { icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); From f081042d128a0c7acbd67611def62e1b52e2d294 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Sun, 22 Dec 2019 10:51:16 +0800 Subject: [PATCH 074/118] net/dst: do not confirm neighbor for vxlan and geneve pmtu update When do IPv6 tunnel PMTU update and calls __ip6_rt_update_pmtu() in the end, we should not call dst_confirm_neigh() as there is no two-way communication. So disable the neigh confirm for vxlan and geneve pmtu update. v5: No change. v4: No change. v3: Do not remove dst_confirm_neigh, but add a new bool parameter in dst_ops.update_pmtu to control whether we should do neighbor confirm. Also split the big patch to small ones for each area. v2: Remove dst_confirm_neigh in __ip6_rt_update_pmtu. Fixes: a93bf0ff4490 ("vxlan: update skb dst pmtu on tx path") Fixes: 52a589d51f10 ("geneve: update skb dst pmtu on tx path") Reviewed-by: Guillaume Nault Tested-by: Guillaume Nault Acked-by: David Ahern Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- include/net/dst.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/dst.h b/include/net/dst.h index dc7cc1f1051c..3448cf865ede 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -535,7 +535,7 @@ static inline void skb_tunnel_check_pmtu(struct sk_buff *skb, u32 encap_mtu = dst_mtu(encap_dst); if (skb->len > encap_mtu - headroom) - skb_dst_update_pmtu(skb, encap_mtu - headroom); + skb_dst_update_pmtu_no_confirm(skb, encap_mtu - headroom); } #endif /* _NET_DST_H */ From 8b5026bc16938920e4780b9094c3bf20e1e0939d Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Mon, 23 Dec 2019 15:03:21 +0100 Subject: [PATCH 075/118] s390/qeth: fix qdio teardown after early init error qeth_l?_set_online() goes through a number of initialization steps, and on any error uses qeth_l?_stop_card() to tear down the residual state. The first initialization step is qeth_core_hardsetup_card(). When this fails after having established a QDIO context on the device (ie. somewhere after qeth_mpc_initialize()), qeth_l?_stop_card() doesn't shut down this QDIO context again (since the card state hasn't progressed from DOWN at this stage). Even worse, we then call qdio_free() as final teardown step to free the QDIO data structures - while some of them are still hooked into wider QDIO infrastructure such as the IRQ list. This is inevitably followed by use-after-frees and other nastyness. Fix this by unconditionally calling qeth_qdio_clear_card() to shut down the QDIO context, and also to halt/clear any pending activity on the various IO channels. Remove the naive attempt at handling the teardown in qeth_mpc_initialize(), it clearly doesn't suffice and we're handling it properly now in the wider teardown code. Fixes: 4a71df50047f ("qeth: new qeth device driver") Signed-off-by: Julian Wiedmann Signed-off-by: David S. Miller --- drivers/s390/net/qeth_core_main.c | 20 ++++++++------------ drivers/s390/net/qeth_l2_main.c | 2 +- drivers/s390/net/qeth_l3_main.c | 2 +- 3 files changed, 10 insertions(+), 14 deletions(-) diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index bc4158888af9..324cf22f9111 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -2482,50 +2482,46 @@ static int qeth_mpc_initialize(struct qeth_card *card) rc = qeth_cm_enable(card); if (rc) { QETH_CARD_TEXT_(card, 2, "2err%d", rc); - goto out_qdio; + return rc; } rc = qeth_cm_setup(card); if (rc) { QETH_CARD_TEXT_(card, 2, "3err%d", rc); - goto out_qdio; + return rc; } rc = qeth_ulp_enable(card); if (rc) { QETH_CARD_TEXT_(card, 2, "4err%d", rc); - goto out_qdio; + return rc; } rc = qeth_ulp_setup(card); if (rc) { QETH_CARD_TEXT_(card, 2, "5err%d", rc); - goto out_qdio; + return rc; } rc = qeth_alloc_qdio_queues(card); if (rc) { QETH_CARD_TEXT_(card, 2, "5err%d", rc); - goto out_qdio; + return rc; } rc = qeth_qdio_establish(card); if (rc) { QETH_CARD_TEXT_(card, 2, "6err%d", rc); qeth_free_qdio_queues(card); - goto out_qdio; + return rc; } rc = qeth_qdio_activate(card); if (rc) { QETH_CARD_TEXT_(card, 2, "7err%d", rc); - goto out_qdio; + return rc; } rc = qeth_dm_act(card); if (rc) { QETH_CARD_TEXT_(card, 2, "8err%d", rc); - goto out_qdio; + return rc; } return 0; -out_qdio: - qeth_qdio_clear_card(card, !IS_IQD(card)); - qdio_free(CARD_DDEV(card)); - return rc; } void qeth_print_status_message(struct qeth_card *card) diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c index 8c95e6019bac..15e2fd65d434 100644 --- a/drivers/s390/net/qeth_l2_main.c +++ b/drivers/s390/net/qeth_l2_main.c @@ -287,12 +287,12 @@ static void qeth_l2_stop_card(struct qeth_card *card) card->state = CARD_STATE_HARDSETUP; } if (card->state == CARD_STATE_HARDSETUP) { - qeth_qdio_clear_card(card, 0); qeth_drain_output_queues(card); qeth_clear_working_pool_list(card); card->state = CARD_STATE_DOWN; } + qeth_qdio_clear_card(card, 0); flush_workqueue(card->event_wq); card->info.mac_bits &= ~QETH_LAYER2_MAC_REGISTERED; card->info.promisc_mode = 0; diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c index 04e301de376f..5508ab89b518 100644 --- a/drivers/s390/net/qeth_l3_main.c +++ b/drivers/s390/net/qeth_l3_main.c @@ -1307,12 +1307,12 @@ static void qeth_l3_stop_card(struct qeth_card *card) card->state = CARD_STATE_HARDSETUP; } if (card->state == CARD_STATE_HARDSETUP) { - qeth_qdio_clear_card(card, 0); qeth_drain_output_queues(card); qeth_clear_working_pool_list(card); card->state = CARD_STATE_DOWN; } + qeth_qdio_clear_card(card, 0); flush_workqueue(card->event_wq); card->info.promisc_mode = 0; } From 5b6c7b55cfe26224b0f41b1c226d3534c542787f Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Mon, 23 Dec 2019 15:03:22 +0100 Subject: [PATCH 076/118] s390/qeth: lock the card while changing its hsuid qeth_l3_dev_hsuid_store() initially checks the card state, but doesn't take the conf_mutex to ensure that the card stays in this state while being reconfigured. Rework the code to take this lock, and drop a redundant state check in a helper function. Fixes: b333293058aa ("qeth: add support for af_iucv HiperSockets transport") Signed-off-by: Julian Wiedmann Signed-off-by: David S. Miller --- drivers/s390/net/qeth_core_main.c | 5 ---- drivers/s390/net/qeth_l3_sys.c | 40 +++++++++++++++++++++---------- 2 files changed, 28 insertions(+), 17 deletions(-) diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index 324cf22f9111..c64ef55f0dff 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -3425,11 +3425,6 @@ int qeth_configure_cq(struct qeth_card *card, enum qeth_cq cq) goto out; } - if (card->state != CARD_STATE_DOWN) { - rc = -1; - goto out; - } - qeth_free_qdio_queues(card); card->options.cq = cq; rc = 0; diff --git a/drivers/s390/net/qeth_l3_sys.c b/drivers/s390/net/qeth_l3_sys.c index f9067ed6c7d3..e8c848f72c6d 100644 --- a/drivers/s390/net/qeth_l3_sys.c +++ b/drivers/s390/net/qeth_l3_sys.c @@ -242,21 +242,33 @@ static ssize_t qeth_l3_dev_hsuid_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct qeth_card *card = dev_get_drvdata(dev); + int rc = 0; char *tmp; - int rc; if (!IS_IQD(card)) return -EPERM; - if (card->state != CARD_STATE_DOWN) - return -EPERM; - if (card->options.sniffer) - return -EPERM; - if (card->options.cq == QETH_CQ_NOTAVAILABLE) - return -EPERM; + + mutex_lock(&card->conf_mutex); + if (card->state != CARD_STATE_DOWN) { + rc = -EPERM; + goto out; + } + + if (card->options.sniffer) { + rc = -EPERM; + goto out; + } + + if (card->options.cq == QETH_CQ_NOTAVAILABLE) { + rc = -EPERM; + goto out; + } tmp = strsep((char **)&buf, "\n"); - if (strlen(tmp) > 8) - return -EINVAL; + if (strlen(tmp) > 8) { + rc = -EINVAL; + goto out; + } if (card->options.hsuid[0]) /* delete old ip address */ @@ -267,11 +279,13 @@ static ssize_t qeth_l3_dev_hsuid_store(struct device *dev, card->options.hsuid[0] = '\0'; memcpy(card->dev->perm_addr, card->options.hsuid, 9); qeth_configure_cq(card, QETH_CQ_DISABLED); - return count; + goto out; } - if (qeth_configure_cq(card, QETH_CQ_ENABLED)) - return -EPERM; + if (qeth_configure_cq(card, QETH_CQ_ENABLED)) { + rc = -EPERM; + goto out; + } snprintf(card->options.hsuid, sizeof(card->options.hsuid), "%-8s", tmp); @@ -280,6 +294,8 @@ static ssize_t qeth_l3_dev_hsuid_store(struct device *dev, rc = qeth_l3_modify_hsuid(card, true); +out: + mutex_unlock(&card->conf_mutex); return rc ? rc : count; } From 68c57bfd52836e31bff33e5e1fc64029749d2c35 Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Mon, 23 Dec 2019 15:03:23 +0100 Subject: [PATCH 077/118] s390/qeth: fix false reporting of VNIC CHAR config failure Symptom: Error message "Configuring the VNIC characteristics failed" in dmesg whenever an OSA interface on z15 is set online. The VNIC characteristics get re-programmed when setting a L2 device online. This follows the selected 'wanted' characteristics - with the exception that the INVISIBLE characteristic unconditionally gets switched off. For devices that don't support INVISIBLE (ie. OSA), the resulting IO failure raises a noisy error message ("Configuring the VNIC characteristics failed"). For IQD, INVISIBLE is off by default anyways. So don't unnecessarily special-case the INVISIBLE characteristic, and thereby suppress the misleading error message on OSA devices. Fixes: caa1f0b10d18 ("s390/qeth: add VNICC enable/disable support") Signed-off-by: Alexandra Winter Reviewed-by: Julian Wiedmann Signed-off-by: Julian Wiedmann Signed-off-by: David S. Miller --- drivers/s390/net/qeth_l2_main.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c index 15e2fd65d434..fc5d8ed3a737 100644 --- a/drivers/s390/net/qeth_l2_main.c +++ b/drivers/s390/net/qeth_l2_main.c @@ -2041,7 +2041,6 @@ static void qeth_l2_vnicc_init(struct qeth_card *card) error |= qeth_l2_vnicc_recover_timeout(card, QETH_VNICC_LEARNING, timeout); chars_tmp = card->options.vnicc.wanted_chars ^ QETH_VNICC_DEFAULT; - chars_tmp |= QETH_VNICC_BRIDGE_INVISIBLE; chars_len = sizeof(card->options.vnicc.wanted_chars) * BITS_PER_BYTE; for_each_set_bit(i, &chars_tmp, chars_len) { vnicc = BIT(i); From e8a66d800471e2df7f0b484e2e46898b21d1fa82 Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Mon, 23 Dec 2019 15:03:24 +0100 Subject: [PATCH 078/118] s390/qeth: Fix vnicc_is_in_use if rx_bcast not set Symptom: After vnicc/rx_bcast has been manually set to 0, bridge_* sysfs parameters can still be set or written. Only occurs on HiperSockets, as OSA doesn't support changing rx_bcast. Vnic characteristics and bridgeport settings are mutually exclusive. rx_bcast defaults to 1, so manually setting it to 0 should disable bridge_* parameters. Instead it makes sense here to check the supported mask. If the card does not support vnicc at all, bridge commands are always allowed. Fixes: caa1f0b10d18 ("s390/qeth: add VNICC enable/disable support") Signed-off-by: Alexandra Winter Signed-off-by: Julian Wiedmann Signed-off-by: David S. Miller --- drivers/s390/net/qeth_l2_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c index fc5d8ed3a737..8024a2112a87 100644 --- a/drivers/s390/net/qeth_l2_main.c +++ b/drivers/s390/net/qeth_l2_main.c @@ -1952,8 +1952,7 @@ int qeth_l2_vnicc_get_timeout(struct qeth_card *card, u32 *timeout) /* check if VNICC is currently enabled */ bool qeth_l2_vnicc_is_in_use(struct qeth_card *card) { - /* if everything is turned off, VNICC is not active */ - if (!card->options.vnicc.cur_chars) + if (!card->options.vnicc.sup_chars) return false; /* default values are only OK if rx_bcast was not enabled by user * or the card is offline. From d1b9ae1864fc3c000e0eb4af8482d78c63e0915a Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Mon, 23 Dec 2019 15:03:25 +0100 Subject: [PATCH 079/118] s390/qeth: vnicc Fix init to default During vnicc_init wanted_char should be compared to cur_char and not to QETH_VNICC_DEFAULT. Without this patch there is no way to enforce the default values as desired values. Note, that it is expected, that a card comes online with default values. This patch was tested with private card firmware. Fixes: caa1f0b10d18 ("s390/qeth: add VNICC enable/disable support") Signed-off-by: Alexandra Winter Signed-off-by: Julian Wiedmann Signed-off-by: David S. Miller --- drivers/s390/net/qeth_l2_main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c index 8024a2112a87..47d37e75dda6 100644 --- a/drivers/s390/net/qeth_l2_main.c +++ b/drivers/s390/net/qeth_l2_main.c @@ -2039,7 +2039,9 @@ static void qeth_l2_vnicc_init(struct qeth_card *card) /* enforce assumed default values and recover settings, if changed */ error |= qeth_l2_vnicc_recover_timeout(card, QETH_VNICC_LEARNING, timeout); - chars_tmp = card->options.vnicc.wanted_chars ^ QETH_VNICC_DEFAULT; + /* Change chars, if necessary */ + chars_tmp = card->options.vnicc.wanted_chars ^ + card->options.vnicc.cur_chars; chars_len = sizeof(card->options.vnicc.wanted_chars) * BITS_PER_BYTE; for_each_set_bit(i, &chars_tmp, chars_len) { vnicc = BIT(i); From 0b698c838e84149b690c7e979f78cccb6f8aa4b9 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Mon, 23 Dec 2019 15:03:26 +0100 Subject: [PATCH 080/118] s390/qeth: fix initialization on old HW I stumbled over an old OSA model that claims to support DIAG_ASSIST, but then rejects the cmd to query its DIAG capabilities. In the old code this was ok, as the returned raw error code was > 0. Now that we translate the raw codes to errnos, the "rc < 0" causes us to fail the initialization of the device. The fix is trivial: don't bail out when the DIAG query fails. Such an error is not critical, we can still use the device (with a slightly reduced set of features). Fixes: 742d4d40831d ("s390/qeth: convert remaining legacy cmd callbacks") Signed-off-by: Julian Wiedmann Signed-off-by: David S. Miller --- drivers/s390/net/qeth_core_main.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index c64ef55f0dff..29facb913671 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -5026,10 +5026,8 @@ int qeth_core_hardsetup_card(struct qeth_card *card, bool *carrier_ok) } if (qeth_adp_supported(card, IPA_SETADP_SET_DIAG_ASSIST)) { rc = qeth_query_setdiagass(card); - if (rc < 0) { + if (rc) QETH_CARD_TEXT_(card, 2, "8err%d", rc); - goto out; - } } if (!qeth_is_diagass_supported(card, QETH_DIAGS_CMD_TRAP) || From 8385d756e114f2df8568e508902d5f9850817ffb Mon Sep 17 00:00:00 2001 From: Sascha Hauer Date: Fri, 13 Dec 2019 09:04:08 +0100 Subject: [PATCH 081/118] libata: Fix retrieving of active qcs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ata_qc_complete_multiple() is called with a mask of the still active tags. mv_sata doesn't have this information directly and instead calculates the still active tags from the started tags (ap->qc_active) and the finished tags as (ap->qc_active ^ done_mask) Since 28361c40368 the hw_tag and tag are no longer the same and the equation is no longer valid. In ata_exec_internal_sg() ap->qc_active is initialized as 1ULL << ATA_TAG_INTERNAL, but in hardware tag 0 is started and this will be in done_mask on completion. ap->qc_active ^ done_mask becomes 0x100000000 ^ 0x1 = 0x100000001 and thus tag 0 used as the internal tag will never be reported as completed. This is fixed by introducing ata_qc_get_active() which returns the active hardware tags and calling it where appropriate. This is tested on mv_sata, but sata_fsl and sata_nv suffer from the same problem. There is another case in sata_nv that most likely needs fixing as well, but this looks a little different, so I wasn't confident enough to change that. Fixes: 28361c403683 ("libata: add extra internal command") Cc: stable@vger.kernel.org Tested-by: Pali Rohár Signed-off-by: Sascha Hauer Add missing export of ata_qc_get_active(), as per Pali. Signed-off-by: Jens Axboe --- drivers/ata/libata-core.c | 24 ++++++++++++++++++++++++ drivers/ata/sata_fsl.c | 2 +- drivers/ata/sata_mv.c | 2 +- drivers/ata/sata_nv.c | 2 +- include/linux/libata.h | 1 + 5 files changed, 28 insertions(+), 3 deletions(-) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index e9017c570bc5..6f4ab5c5b52d 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -5328,6 +5328,30 @@ void ata_qc_complete(struct ata_queued_cmd *qc) } } +/** + * ata_qc_get_active - get bitmask of active qcs + * @ap: port in question + * + * LOCKING: + * spin_lock_irqsave(host lock) + * + * RETURNS: + * Bitmask of active qcs + */ +u64 ata_qc_get_active(struct ata_port *ap) +{ + u64 qc_active = ap->qc_active; + + /* ATA_TAG_INTERNAL is sent to hw as tag 0 */ + if (qc_active & (1ULL << ATA_TAG_INTERNAL)) { + qc_active |= (1 << 0); + qc_active &= ~(1ULL << ATA_TAG_INTERNAL); + } + + return qc_active; +} +EXPORT_SYMBOL_GPL(ata_qc_get_active); + /** * ata_qc_complete_multiple - Complete multiple qcs successfully * @ap: port in question diff --git a/drivers/ata/sata_fsl.c b/drivers/ata/sata_fsl.c index 9239615d8a04..d55ee244d693 100644 --- a/drivers/ata/sata_fsl.c +++ b/drivers/ata/sata_fsl.c @@ -1280,7 +1280,7 @@ static void sata_fsl_host_intr(struct ata_port *ap) i, ioread32(hcr_base + CC), ioread32(hcr_base + CA)); } - ata_qc_complete_multiple(ap, ap->qc_active ^ done_mask); + ata_qc_complete_multiple(ap, ata_qc_get_active(ap) ^ done_mask); return; } else if ((ap->qc_active & (1ULL << ATA_TAG_INTERNAL))) { diff --git a/drivers/ata/sata_mv.c b/drivers/ata/sata_mv.c index 277f11909fc1..d7228f8e9297 100644 --- a/drivers/ata/sata_mv.c +++ b/drivers/ata/sata_mv.c @@ -2829,7 +2829,7 @@ static void mv_process_crpb_entries(struct ata_port *ap, struct mv_port_priv *pp } if (work_done) { - ata_qc_complete_multiple(ap, ap->qc_active ^ done_mask); + ata_qc_complete_multiple(ap, ata_qc_get_active(ap) ^ done_mask); /* Update the software queue position index in hardware */ writelfl((pp->crpb_dma & EDMA_RSP_Q_BASE_LO_MASK) | diff --git a/drivers/ata/sata_nv.c b/drivers/ata/sata_nv.c index f3e62f5528bd..eb9dc14e5147 100644 --- a/drivers/ata/sata_nv.c +++ b/drivers/ata/sata_nv.c @@ -984,7 +984,7 @@ static irqreturn_t nv_adma_interrupt(int irq, void *dev_instance) check_commands = 0; check_commands &= ~(1 << pos); } - ata_qc_complete_multiple(ap, ap->qc_active ^ done_mask); + ata_qc_complete_multiple(ap, ata_qc_get_active(ap) ^ done_mask); } } diff --git a/include/linux/libata.h b/include/linux/libata.h index d3bbfddf616a..2dbde119721d 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1175,6 +1175,7 @@ extern unsigned int ata_do_dev_read_id(struct ata_device *dev, struct ata_taskfile *tf, u16 *id); extern void ata_qc_complete(struct ata_queued_cmd *qc); extern int ata_qc_complete_multiple(struct ata_port *ap, u64 qc_active); +extern u64 ata_qc_get_active(struct ata_port *ap); extern void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd); extern int ata_std_bios_param(struct scsi_device *sdev, struct block_device *bdev, From 57b948e2c9ce58e4fcb687bc285b5b5a2cc66640 Mon Sep 17 00:00:00 2001 From: Netanel Belgazal Date: Sun, 22 Dec 2019 09:47:59 +0000 Subject: [PATCH 082/118] MAINTAINERS: Add additional maintainers to ENA Ethernet driver Signed-off-by: Netanel Belgazal Signed-off-by: David S. Miller --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index ffa3371bc750..992474b930bf 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -771,6 +771,8 @@ F: drivers/thermal/thermal_mmio.c AMAZON ETHERNET DRIVERS M: Netanel Belgazal +M: Arthur Kiyanovski +R: Guy Tzalik R: Saeed Bishara R: Zorik Machulsky L: netdev@vger.kernel.org From 84bb59d773853bc2dda2ac1ef8474c40eb33a3c6 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sun, 22 Dec 2019 11:25:27 +0000 Subject: [PATCH 083/118] hsr: avoid debugfs warning message when module is remove When hsr module is being removed, debugfs_remove() is called to remove both debugfs directory and file. When module is being removed, module state is changed to MODULE_STATE_GOING then exit() is called. At this moment, module couldn't be held so try_module_get() will be failed. debugfs's open() callback tries to hold the module if .owner is existing. If it fails, warning message is printed. CPU0 CPU1 delete_module() try_stop_module() hsr_exit() open() <-- WARNING debugfs_remove() In order to avoid the warning message, this patch makes hsr module does not set .owner. Unsetting .owner is safe because these are protected by inode_lock(). Test commands: #SHELL1 ip link add dummy0 type dummy ip link add dummy1 type dummy while : do ip link add hsr0 type hsr slave1 dummy0 slave2 dummy1 modprobe -rv hsr done #SHELL2 while : do cat /sys/kernel/debug/hsr0/node_table done Splat looks like: [ 101.223783][ T1271] ------------[ cut here ]------------ [ 101.230309][ T1271] debugfs file owner did not clean up at exit: node_table [ 101.230380][ T1271] WARNING: CPU: 3 PID: 1271 at fs/debugfs/file.c:309 full_proxy_open+0x10f/0x650 [ 101.233153][ T1271] Modules linked in: hsr(-) dummy veth openvswitch nsh nf_conncount nf_nat nf_conntrack nf_d] [ 101.237112][ T1271] CPU: 3 PID: 1271 Comm: cat Tainted: G W 5.5.0-rc1+ #204 [ 101.238270][ T1271] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 [ 101.240379][ T1271] RIP: 0010:full_proxy_open+0x10f/0x650 [ 101.241166][ T1271] Code: 48 c1 ea 03 80 3c 02 00 0f 85 c1 04 00 00 49 8b 3c 24 e8 04 86 7e ff 84 c0 75 2d 4c 8 [ 101.251985][ T1271] RSP: 0018:ffff8880ca22fa38 EFLAGS: 00010286 [ 101.273355][ T1271] RAX: dffffc0000000008 RBX: ffff8880cc6e6200 RCX: 0000000000000000 [ 101.274466][ T1271] RDX: 0000000000000000 RSI: 0000000000000006 RDI: ffff8880c4dd5c14 [ 101.275581][ T1271] RBP: 0000000000000000 R08: fffffbfff2922f5d R09: 0000000000000000 [ 101.276733][ T1271] R10: 0000000000000001 R11: 0000000000000000 R12: ffffffffc0551bc0 [ 101.277853][ T1271] R13: ffff8880c4059a48 R14: ffff8880be50a5e0 R15: ffffffff941adaa0 [ 101.278956][ T1271] FS: 00007f8871cda540(0000) GS:ffff8880da800000(0000) knlGS:0000000000000000 [ 101.280216][ T1271] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 101.282832][ T1271] CR2: 00007f88717cfd10 CR3: 00000000b9440005 CR4: 00000000000606e0 [ 101.283974][ T1271] Call Trace: [ 101.285328][ T1271] do_dentry_open+0x63c/0xf50 [ 101.286077][ T1271] ? open_proxy_open+0x270/0x270 [ 101.288271][ T1271] ? __x64_sys_fchdir+0x180/0x180 [ 101.288987][ T1271] ? inode_permission+0x65/0x390 [ 101.289682][ T1271] path_openat+0x701/0x2810 [ 101.290294][ T1271] ? path_lookupat+0x880/0x880 [ 101.290957][ T1271] ? check_chain_key+0x236/0x5d0 [ 101.291676][ T1271] ? __lock_acquire+0xdfe/0x3de0 [ 101.292358][ T1271] ? sched_clock+0x5/0x10 [ 101.292962][ T1271] ? sched_clock_cpu+0x18/0x170 [ 101.293644][ T1271] ? find_held_lock+0x39/0x1d0 [ 101.305616][ T1271] do_filp_open+0x17a/0x270 [ 101.306061][ T1271] ? may_open_dev+0xc0/0xc0 [ ... ] Fixes: fc4ecaeebd26 ("net: hsr: add debugfs support for display node list") Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- net/hsr/hsr_debugfs.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/hsr/hsr_debugfs.c b/net/hsr/hsr_debugfs.c index 94447974a3c0..6135706f03d5 100644 --- a/net/hsr/hsr_debugfs.c +++ b/net/hsr/hsr_debugfs.c @@ -64,7 +64,6 @@ hsr_node_table_open(struct inode *inode, struct file *filp) } static const struct file_operations hsr_fops = { - .owner = THIS_MODULE, .open = hsr_node_table_open, .read = seq_read, .llseek = seq_lseek, From 1d19e2d53e8ed9e4c98fc95e0067492cda7288b0 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sun, 22 Dec 2019 11:26:15 +0000 Subject: [PATCH 084/118] hsr: fix error handling routine in hsr_dev_finalize() hsr_dev_finalize() is called to create new hsr interface. There are some wrong error handling codes. 1. wrong checking return value of debugfs_create_{dir/file}. These function doesn't return NULL. If error occurs in there, it returns error pointer. So, it should check error pointer instead of NULL. 2. It doesn't unregister interface if it fails to setup hsr interface. If it fails to initialize hsr interface after register_netdevice(), it should call unregister_netdevice(). 3. Ignore failure of creation of debugfs If creating of debugfs dir and file is failed, creating hsr interface will be failed. But debugfs doesn't affect actual logic of hsr module. So, ignoring this is more correct and this behavior is more general. Fixes: c5a759117210 ("net/hsr: Use list_head (and rcu) instead of array for slave devices.") Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- net/hsr/hsr_debugfs.c | 15 +++++++-------- net/hsr/hsr_device.c | 19 ++++++++++--------- net/hsr/hsr_main.h | 11 ++++------- 3 files changed, 21 insertions(+), 24 deletions(-) diff --git a/net/hsr/hsr_debugfs.c b/net/hsr/hsr_debugfs.c index 6135706f03d5..6618a9d8e58e 100644 --- a/net/hsr/hsr_debugfs.c +++ b/net/hsr/hsr_debugfs.c @@ -77,15 +77,14 @@ static const struct file_operations hsr_fops = { * When debugfs is configured this routine sets up the node_table file per * hsr device for dumping the node_table entries */ -int hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev) +void hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev) { - int rc = -1; struct dentry *de = NULL; de = debugfs_create_dir(hsr_dev->name, NULL); - if (!de) { + if (IS_ERR(de)) { pr_err("Cannot create hsr debugfs root\n"); - return rc; + return; } priv->node_tbl_root = de; @@ -93,13 +92,13 @@ int hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev) de = debugfs_create_file("node_table", S_IFREG | 0444, priv->node_tbl_root, priv, &hsr_fops); - if (!de) { + if (IS_ERR(de)) { pr_err("Cannot create hsr node_table directory\n"); - return rc; + debugfs_remove(priv->node_tbl_root); + priv->node_tbl_root = NULL; + return; } priv->node_tbl_file = de; - - return 0; } /* hsr_debugfs_term - Tear down debugfs intrastructure diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index b01e1bae4ddc..e73549075a03 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -477,30 +477,31 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], res = hsr_add_port(hsr, hsr_dev, HSR_PT_MASTER); if (res) - goto err_add_port; + goto err_add_master; res = register_netdevice(hsr_dev); if (res) - goto fail; + goto err_unregister; res = hsr_add_port(hsr, slave[0], HSR_PT_SLAVE_A); if (res) - goto fail; + goto err_add_slaves; + res = hsr_add_port(hsr, slave[1], HSR_PT_SLAVE_B); if (res) - goto fail; + goto err_add_slaves; + hsr_debugfs_init(hsr, hsr_dev); mod_timer(&hsr->prune_timer, jiffies + msecs_to_jiffies(PRUNE_PERIOD)); - res = hsr_debugfs_init(hsr, hsr_dev); - if (res) - goto fail; return 0; -fail: +err_add_slaves: + unregister_netdevice(hsr_dev); +err_unregister: list_for_each_entry_safe(port, tmp, &hsr->ports, port_list) hsr_del_port(port); -err_add_port: +err_add_master: hsr_del_self_node(&hsr->self_node_db); return res; diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index 96fac696a1e1..acab9c353a49 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -184,15 +184,12 @@ static inline u16 hsr_get_skb_sequence_nr(struct sk_buff *skb) } #if IS_ENABLED(CONFIG_DEBUG_FS) -int hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev); +void hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev); void hsr_debugfs_term(struct hsr_priv *priv); #else -static inline int hsr_debugfs_init(struct hsr_priv *priv, - struct net_device *hsr_dev) -{ - return 0; -} - +static inline void hsr_debugfs_init(struct hsr_priv *priv, + struct net_device *hsr_dev) +{} static inline void hsr_debugfs_term(struct hsr_priv *priv) {} #endif From c6c4ccd7f96993e106dfea7ef18127f972f2db5e Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sun, 22 Dec 2019 11:26:27 +0000 Subject: [PATCH 085/118] hsr: add hsr root debugfs directory In current hsr code, when hsr interface is created, it creates debugfs directory /sys/kernel/debug/. If there is same directory or file name in there, it fails. In order to reduce possibility of failure of creation of debugfs, this patch adds root directory. Test commands: ip link add dummy0 type dummy ip link add dummy1 type dummy ip link add hsr0 type hsr slave1 dummy0 slave2 dummy1 Before this patch: /sys/kernel/debug/hsr0/node_table After this patch: /sys/kernel/debug/hsr/hsr0/node_table Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- net/hsr/hsr_debugfs.c | 23 ++++++++++++++++++++--- net/hsr/hsr_main.c | 1 + net/hsr/hsr_main.h | 6 ++++++ net/hsr/hsr_netlink.c | 1 + 4 files changed, 28 insertions(+), 3 deletions(-) diff --git a/net/hsr/hsr_debugfs.c b/net/hsr/hsr_debugfs.c index 6618a9d8e58e..a7462a718e7b 100644 --- a/net/hsr/hsr_debugfs.c +++ b/net/hsr/hsr_debugfs.c @@ -20,6 +20,8 @@ #include "hsr_main.h" #include "hsr_framereg.h" +static struct dentry *hsr_debugfs_root_dir; + static void print_mac_address(struct seq_file *sfp, unsigned char *mac) { seq_printf(sfp, "%02x:%02x:%02x:%02x:%02x:%02x:", @@ -81,9 +83,9 @@ void hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev) { struct dentry *de = NULL; - de = debugfs_create_dir(hsr_dev->name, NULL); + de = debugfs_create_dir(hsr_dev->name, hsr_debugfs_root_dir); if (IS_ERR(de)) { - pr_err("Cannot create hsr debugfs root\n"); + pr_err("Cannot create hsr debugfs directory\n"); return; } @@ -93,7 +95,7 @@ void hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev) priv->node_tbl_root, priv, &hsr_fops); if (IS_ERR(de)) { - pr_err("Cannot create hsr node_table directory\n"); + pr_err("Cannot create hsr node_table file\n"); debugfs_remove(priv->node_tbl_root); priv->node_tbl_root = NULL; return; @@ -115,3 +117,18 @@ hsr_debugfs_term(struct hsr_priv *priv) debugfs_remove(priv->node_tbl_root); priv->node_tbl_root = NULL; } + +void hsr_debugfs_create_root(void) +{ + hsr_debugfs_root_dir = debugfs_create_dir("hsr", NULL); + if (IS_ERR(hsr_debugfs_root_dir)) { + pr_err("Cannot create hsr debugfs root directory\n"); + hsr_debugfs_root_dir = NULL; + } +} + +void hsr_debugfs_remove_root(void) +{ + /* debugfs_remove() internally checks NULL and ERROR */ + debugfs_remove(hsr_debugfs_root_dir); +} diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c index b9988a662ee1..490896379073 100644 --- a/net/hsr/hsr_main.c +++ b/net/hsr/hsr_main.c @@ -123,6 +123,7 @@ static void __exit hsr_exit(void) { unregister_netdevice_notifier(&hsr_nb); hsr_netlink_exit(); + hsr_debugfs_remove_root(); } module_init(hsr_init); diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index acab9c353a49..55d2057bf749 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -186,12 +186,18 @@ static inline u16 hsr_get_skb_sequence_nr(struct sk_buff *skb) #if IS_ENABLED(CONFIG_DEBUG_FS) void hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev); void hsr_debugfs_term(struct hsr_priv *priv); +void hsr_debugfs_create_root(void); +void hsr_debugfs_remove_root(void); #else static inline void hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev) {} static inline void hsr_debugfs_term(struct hsr_priv *priv) {} +static inline void hsr_debugfs_create_root(void) +{} +static inline void hsr_debugfs_remove_root(void) +{} #endif #endif /* __HSR_PRIVATE_H */ diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c index 8f8337f893ba..8dc0547f01d0 100644 --- a/net/hsr/hsr_netlink.c +++ b/net/hsr/hsr_netlink.c @@ -476,6 +476,7 @@ int __init hsr_netlink_init(void) if (rc) goto fail_genl_register_family; + hsr_debugfs_create_root(); return 0; fail_genl_register_family: From 4c2d5e33dcd3a6333a7895be3b542ff3d373177c Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sun, 22 Dec 2019 11:26:39 +0000 Subject: [PATCH 086/118] hsr: rename debugfs file when interface name is changed hsr interface has own debugfs file, which name is same with interface name. So, interface name is changed, debugfs file name should be changed too. Fixes: fc4ecaeebd26 ("net: hsr: add debugfs support for display node list") Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- net/hsr/hsr_debugfs.c | 13 +++++++++++++ net/hsr/hsr_main.c | 3 +++ net/hsr/hsr_main.h | 4 ++++ 3 files changed, 20 insertions(+) diff --git a/net/hsr/hsr_debugfs.c b/net/hsr/hsr_debugfs.c index a7462a718e7b..d5f709b940ff 100644 --- a/net/hsr/hsr_debugfs.c +++ b/net/hsr/hsr_debugfs.c @@ -65,6 +65,19 @@ hsr_node_table_open(struct inode *inode, struct file *filp) return single_open(filp, hsr_node_table_show, inode->i_private); } +void hsr_debugfs_rename(struct net_device *dev) +{ + struct hsr_priv *priv = netdev_priv(dev); + struct dentry *d; + + d = debugfs_rename(hsr_debugfs_root_dir, priv->node_tbl_root, + hsr_debugfs_root_dir, dev->name); + if (IS_ERR(d)) + netdev_warn(dev, "failed to rename\n"); + else + priv->node_tbl_root = d; +} + static const struct file_operations hsr_fops = { .open = hsr_node_table_open, .read = seq_read, diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c index 490896379073..ea23eb7408e4 100644 --- a/net/hsr/hsr_main.c +++ b/net/hsr/hsr_main.c @@ -45,6 +45,9 @@ static int hsr_netdev_notify(struct notifier_block *nb, unsigned long event, case NETDEV_CHANGE: /* Link (carrier) state changes */ hsr_check_carrier_and_operstate(hsr); break; + case NETDEV_CHANGENAME: + hsr_debugfs_rename(dev); + break; case NETDEV_CHANGEADDR: if (port->type == HSR_PT_MASTER) { /* This should not happen since there's no diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index 55d2057bf749..8d885bc6a54d 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -184,11 +184,15 @@ static inline u16 hsr_get_skb_sequence_nr(struct sk_buff *skb) } #if IS_ENABLED(CONFIG_DEBUG_FS) +void hsr_debugfs_rename(struct net_device *dev); void hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev); void hsr_debugfs_term(struct hsr_priv *priv); void hsr_debugfs_create_root(void); void hsr_debugfs_remove_root(void); #else +static inline void void hsr_debugfs_rename(struct net_device *dev) +{ +} static inline void hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev) {} From 92a35678ec075100ce666a2fb6969151affb0e5d Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sun, 22 Dec 2019 11:26:54 +0000 Subject: [PATCH 087/118] hsr: fix a race condition in node list insertion and deletion hsr nodes are protected by RCU and there is no write side lock. But node insertions and deletions could be being operated concurrently. So write side locking is needed. Test commands: ip netns add nst ip link add veth0 type veth peer name veth1 ip link add veth2 type veth peer name veth3 ip link set veth1 netns nst ip link set veth3 netns nst ip link set veth0 up ip link set veth2 up ip link add hsr0 type hsr slave1 veth0 slave2 veth2 ip a a 192.168.100.1/24 dev hsr0 ip link set hsr0 up ip netns exec nst ip link set veth1 up ip netns exec nst ip link set veth3 up ip netns exec nst ip link add hsr1 type hsr slave1 veth1 slave2 veth3 ip netns exec nst ip a a 192.168.100.2/24 dev hsr1 ip netns exec nst ip link set hsr1 up for i in {0..9} do for j in {0..9} do for k in {0..9} do for l in {0..9} do arping 192.168.100.2 -I hsr0 -s 00:01:3$i:4$j:5$k:6$l -c1 & done done done done Splat looks like: [ 236.066091][ T3286] list_add corruption. next->prev should be prev (ffff8880a5940300), but was ffff8880a5940d0. [ 236.069617][ T3286] ------------[ cut here ]------------ [ 236.070545][ T3286] kernel BUG at lib/list_debug.c:25! [ 236.071391][ T3286] invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC KASAN PTI [ 236.072343][ T3286] CPU: 0 PID: 3286 Comm: arping Tainted: G W 5.5.0-rc1+ #209 [ 236.073463][ T3286] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 [ 236.074695][ T3286] RIP: 0010:__list_add_valid+0x74/0xd0 [ 236.075499][ T3286] Code: 48 39 da 75 27 48 39 f5 74 36 48 39 dd 74 31 48 83 c4 08 b8 01 00 00 00 5b 5d c3 48 b [ 236.078277][ T3286] RSP: 0018:ffff8880aaa97648 EFLAGS: 00010286 [ 236.086991][ T3286] RAX: 0000000000000075 RBX: ffff8880d4624c20 RCX: 0000000000000000 [ 236.088000][ T3286] RDX: 0000000000000075 RSI: 0000000000000008 RDI: ffffed1015552ebf [ 236.098897][ T3286] RBP: ffff88809b53d200 R08: ffffed101b3c04f9 R09: ffffed101b3c04f9 [ 236.099960][ T3286] R10: 00000000308769a1 R11: ffffed101b3c04f8 R12: ffff8880d4624c28 [ 236.100974][ T3286] R13: ffff8880d4624c20 R14: 0000000040310100 R15: ffff8880ce17ee02 [ 236.138967][ T3286] FS: 00007f23479fa680(0000) GS:ffff8880d9c00000(0000) knlGS:0000000000000000 [ 236.144852][ T3286] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 236.145720][ T3286] CR2: 00007f4a14bab210 CR3: 00000000a61c6001 CR4: 00000000000606f0 [ 236.146776][ T3286] Call Trace: [ 236.147222][ T3286] hsr_add_node+0x314/0x490 [hsr] [ 236.153633][ T3286] hsr_forward_skb+0x2b6/0x1bc0 [hsr] [ 236.154362][ T3286] ? rcu_read_lock_sched_held+0x90/0xc0 [ 236.155091][ T3286] ? rcu_read_lock_bh_held+0xa0/0xa0 [ 236.156607][ T3286] hsr_dev_xmit+0x70/0xd0 [hsr] [ 236.157254][ T3286] dev_hard_start_xmit+0x160/0x740 [ 236.157941][ T3286] __dev_queue_xmit+0x1961/0x2e10 [ 236.158565][ T3286] ? netdev_core_pick_tx+0x2e0/0x2e0 [ ... ] Reported-by: syzbot+3924327f9ad5f4d2b343@syzkaller.appspotmail.com Fixes: f421436a591d ("net/hsr: Add support for the High-availability Seamless Redundancy protocol (HSRv0)") Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- net/hsr/hsr_device.c | 7 ++-- net/hsr/hsr_framereg.c | 73 ++++++++++++++++++++++++++---------------- net/hsr/hsr_framereg.h | 6 ++-- net/hsr/hsr_main.c | 2 +- net/hsr/hsr_main.h | 5 +-- 5 files changed, 56 insertions(+), 37 deletions(-) diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index e73549075a03..62c03f0d0079 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -368,7 +368,7 @@ static void hsr_dev_destroy(struct net_device *hsr_dev) del_timer_sync(&hsr->prune_timer); del_timer_sync(&hsr->announce_timer); - hsr_del_self_node(&hsr->self_node_db); + hsr_del_self_node(hsr); hsr_del_nodes(&hsr->node_db); } @@ -440,11 +440,12 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], INIT_LIST_HEAD(&hsr->ports); INIT_LIST_HEAD(&hsr->node_db); INIT_LIST_HEAD(&hsr->self_node_db); + spin_lock_init(&hsr->list_lock); ether_addr_copy(hsr_dev->dev_addr, slave[0]->dev_addr); /* Make sure we recognize frames from ourselves in hsr_rcv() */ - res = hsr_create_self_node(&hsr->self_node_db, hsr_dev->dev_addr, + res = hsr_create_self_node(hsr, hsr_dev->dev_addr, slave[1]->dev_addr); if (res < 0) return res; @@ -502,7 +503,7 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], list_for_each_entry_safe(port, tmp, &hsr->ports, port_list) hsr_del_port(port); err_add_master: - hsr_del_self_node(&hsr->self_node_db); + hsr_del_self_node(hsr); return res; } diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index 292be446007b..27dc65d7de67 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -75,10 +75,11 @@ static struct hsr_node *find_node_by_addr_A(struct list_head *node_db, /* Helper for device init; the self_node_db is used in hsr_rcv() to recognize * frames from self that's been looped over the HSR ring. */ -int hsr_create_self_node(struct list_head *self_node_db, +int hsr_create_self_node(struct hsr_priv *hsr, unsigned char addr_a[ETH_ALEN], unsigned char addr_b[ETH_ALEN]) { + struct list_head *self_node_db = &hsr->self_node_db; struct hsr_node *node, *oldnode; node = kmalloc(sizeof(*node), GFP_KERNEL); @@ -88,33 +89,33 @@ int hsr_create_self_node(struct list_head *self_node_db, ether_addr_copy(node->macaddress_A, addr_a); ether_addr_copy(node->macaddress_B, addr_b); - rcu_read_lock(); + spin_lock_bh(&hsr->list_lock); oldnode = list_first_or_null_rcu(self_node_db, struct hsr_node, mac_list); if (oldnode) { list_replace_rcu(&oldnode->mac_list, &node->mac_list); - rcu_read_unlock(); - synchronize_rcu(); - kfree(oldnode); + spin_unlock_bh(&hsr->list_lock); + kfree_rcu(oldnode, rcu_head); } else { - rcu_read_unlock(); list_add_tail_rcu(&node->mac_list, self_node_db); + spin_unlock_bh(&hsr->list_lock); } return 0; } -void hsr_del_self_node(struct list_head *self_node_db) +void hsr_del_self_node(struct hsr_priv *hsr) { + struct list_head *self_node_db = &hsr->self_node_db; struct hsr_node *node; - rcu_read_lock(); + spin_lock_bh(&hsr->list_lock); node = list_first_or_null_rcu(self_node_db, struct hsr_node, mac_list); - rcu_read_unlock(); if (node) { list_del_rcu(&node->mac_list); - kfree(node); + kfree_rcu(node, rcu_head); } + spin_unlock_bh(&hsr->list_lock); } void hsr_del_nodes(struct list_head *node_db) @@ -130,30 +131,43 @@ void hsr_del_nodes(struct list_head *node_db) * seq_out is used to initialize filtering of outgoing duplicate frames * originating from the newly added node. */ -struct hsr_node *hsr_add_node(struct list_head *node_db, unsigned char addr[], - u16 seq_out) +static struct hsr_node *hsr_add_node(struct hsr_priv *hsr, + struct list_head *node_db, + unsigned char addr[], + u16 seq_out) { - struct hsr_node *node; + struct hsr_node *new_node, *node; unsigned long now; int i; - node = kzalloc(sizeof(*node), GFP_ATOMIC); - if (!node) + new_node = kzalloc(sizeof(*new_node), GFP_ATOMIC); + if (!new_node) return NULL; - ether_addr_copy(node->macaddress_A, addr); + ether_addr_copy(new_node->macaddress_A, addr); /* We are only interested in time diffs here, so use current jiffies * as initialization. (0 could trigger an spurious ring error warning). */ now = jiffies; for (i = 0; i < HSR_PT_PORTS; i++) - node->time_in[i] = now; + new_node->time_in[i] = now; for (i = 0; i < HSR_PT_PORTS; i++) - node->seq_out[i] = seq_out; - - list_add_tail_rcu(&node->mac_list, node_db); + new_node->seq_out[i] = seq_out; + spin_lock_bh(&hsr->list_lock); + list_for_each_entry_rcu(node, node_db, mac_list) { + if (ether_addr_equal(node->macaddress_A, addr)) + goto out; + if (ether_addr_equal(node->macaddress_B, addr)) + goto out; + } + list_add_tail_rcu(&new_node->mac_list, node_db); + spin_unlock_bh(&hsr->list_lock); + return new_node; +out: + spin_unlock_bh(&hsr->list_lock); + kfree(new_node); return node; } @@ -163,6 +177,7 @@ struct hsr_node *hsr_get_node(struct hsr_port *port, struct sk_buff *skb, bool is_sup) { struct list_head *node_db = &port->hsr->node_db; + struct hsr_priv *hsr = port->hsr; struct hsr_node *node; struct ethhdr *ethhdr; u16 seq_out; @@ -196,7 +211,7 @@ struct hsr_node *hsr_get_node(struct hsr_port *port, struct sk_buff *skb, seq_out = HSR_SEQNR_START; } - return hsr_add_node(node_db, ethhdr->h_source, seq_out); + return hsr_add_node(hsr, node_db, ethhdr->h_source, seq_out); } /* Use the Supervision frame's info about an eventual macaddress_B for merging @@ -206,10 +221,11 @@ struct hsr_node *hsr_get_node(struct hsr_port *port, struct sk_buff *skb, void hsr_handle_sup_frame(struct sk_buff *skb, struct hsr_node *node_curr, struct hsr_port *port_rcv) { - struct ethhdr *ethhdr; - struct hsr_node *node_real; + struct hsr_priv *hsr = port_rcv->hsr; struct hsr_sup_payload *hsr_sp; + struct hsr_node *node_real; struct list_head *node_db; + struct ethhdr *ethhdr; int i; ethhdr = (struct ethhdr *)skb_mac_header(skb); @@ -231,7 +247,7 @@ void hsr_handle_sup_frame(struct sk_buff *skb, struct hsr_node *node_curr, node_real = find_node_by_addr_A(node_db, hsr_sp->macaddress_A); if (!node_real) /* No frame received from AddrA of this node yet */ - node_real = hsr_add_node(node_db, hsr_sp->macaddress_A, + node_real = hsr_add_node(hsr, node_db, hsr_sp->macaddress_A, HSR_SEQNR_START - 1); if (!node_real) goto done; /* No mem */ @@ -252,7 +268,9 @@ void hsr_handle_sup_frame(struct sk_buff *skb, struct hsr_node *node_curr, } node_real->addr_B_port = port_rcv->type; + spin_lock_bh(&hsr->list_lock); list_del_rcu(&node_curr->mac_list); + spin_unlock_bh(&hsr->list_lock); kfree_rcu(node_curr, rcu_head); done: @@ -368,12 +386,13 @@ void hsr_prune_nodes(struct timer_list *t) { struct hsr_priv *hsr = from_timer(hsr, t, prune_timer); struct hsr_node *node; + struct hsr_node *tmp; struct hsr_port *port; unsigned long timestamp; unsigned long time_a, time_b; - rcu_read_lock(); - list_for_each_entry_rcu(node, &hsr->node_db, mac_list) { + spin_lock_bh(&hsr->list_lock); + list_for_each_entry_safe(node, tmp, &hsr->node_db, mac_list) { /* Don't prune own node. Neither time_in[HSR_PT_SLAVE_A] * nor time_in[HSR_PT_SLAVE_B], will ever be updated for * the master port. Thus the master node will be repeatedly @@ -421,7 +440,7 @@ void hsr_prune_nodes(struct timer_list *t) kfree_rcu(node, rcu_head); } } - rcu_read_unlock(); + spin_unlock_bh(&hsr->list_lock); /* Restart timer */ mod_timer(&hsr->prune_timer, diff --git a/net/hsr/hsr_framereg.h b/net/hsr/hsr_framereg.h index 89a3ce38151d..0f0fa12b4329 100644 --- a/net/hsr/hsr_framereg.h +++ b/net/hsr/hsr_framereg.h @@ -12,10 +12,8 @@ struct hsr_node; -void hsr_del_self_node(struct list_head *self_node_db); +void hsr_del_self_node(struct hsr_priv *hsr); void hsr_del_nodes(struct list_head *node_db); -struct hsr_node *hsr_add_node(struct list_head *node_db, unsigned char addr[], - u16 seq_out); struct hsr_node *hsr_get_node(struct hsr_port *port, struct sk_buff *skb, bool is_sup); void hsr_handle_sup_frame(struct sk_buff *skb, struct hsr_node *node_curr, @@ -33,7 +31,7 @@ int hsr_register_frame_out(struct hsr_port *port, struct hsr_node *node, void hsr_prune_nodes(struct timer_list *t); -int hsr_create_self_node(struct list_head *self_node_db, +int hsr_create_self_node(struct hsr_priv *hsr, unsigned char addr_a[ETH_ALEN], unsigned char addr_b[ETH_ALEN]); diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c index ea23eb7408e4..d2ee7125a7f1 100644 --- a/net/hsr/hsr_main.c +++ b/net/hsr/hsr_main.c @@ -67,7 +67,7 @@ static int hsr_netdev_notify(struct notifier_block *nb, unsigned long event, /* Make sure we recognize frames from ourselves in hsr_rcv() */ port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B); - res = hsr_create_self_node(&hsr->self_node_db, + res = hsr_create_self_node(hsr, master->dev->dev_addr, port ? port->dev->dev_addr : diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index 8d885bc6a54d..d40de84a637f 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -160,8 +160,9 @@ struct hsr_priv { int announce_count; u16 sequence_nr; u16 sup_sequence_nr; /* For HSRv1 separate seq_nr for supervision */ - u8 prot_version; /* Indicate if HSRv0 or HSRv1. */ - spinlock_t seqnr_lock; /* locking for sequence_nr */ + u8 prot_version; /* Indicate if HSRv0 or HSRv1. */ + spinlock_t seqnr_lock; /* locking for sequence_nr */ + spinlock_t list_lock; /* locking for node list */ unsigned char sup_multicast_addr[ETH_ALEN]; #ifdef CONFIG_DEBUG_FS struct dentry *node_tbl_root; From 3ed0a1d563903bdb4b4c36c58c4d9c1bcb23a6e6 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sun, 22 Dec 2019 11:27:08 +0000 Subject: [PATCH 088/118] hsr: reset network header when supervision frame is created The supervision frame is L2 frame. When supervision frame is created, hsr module doesn't set network header. If tap routine is enabled, dev_queue_xmit_nit() is called and it checks network_header. If network_header pointer wasn't set(or invalid), it resets network_header and warns. In order to avoid unnecessary warning message, resetting network_header is needed. Test commands: ip netns add nst ip link add veth0 type veth peer name veth1 ip link add veth2 type veth peer name veth3 ip link set veth1 netns nst ip link set veth3 netns nst ip link set veth0 up ip link set veth2 up ip link add hsr0 type hsr slave1 veth0 slave2 veth2 ip a a 192.168.100.1/24 dev hsr0 ip link set hsr0 up ip netns exec nst ip link set veth1 up ip netns exec nst ip link set veth3 up ip netns exec nst ip link add hsr1 type hsr slave1 veth1 slave2 veth3 ip netns exec nst ip a a 192.168.100.2/24 dev hsr1 ip netns exec nst ip link set hsr1 up tcpdump -nei veth0 Splat looks like: [ 175.852292][ C3] protocol 88fb is buggy, dev veth0 Fixes: f421436a591d ("net/hsr: Add support for the High-availability Seamless Redundancy protocol (HSRv0)") Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- net/hsr/hsr_device.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index 62c03f0d0079..c7bd6c49fadf 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -272,6 +272,8 @@ static void send_hsr_supervision_frame(struct hsr_port *master, skb->dev->dev_addr, skb->len) <= 0) goto out; skb_reset_mac_header(skb); + skb_reset_network_header(skb); + skb_reset_transport_header(skb); if (hsr_ver > 0) { hsr_tag = skb_put(skb, sizeof(struct hsr_tag)); From 84b032dbfdf1c139cd2b864e43959510646975f8 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 10 Dec 2019 10:53:44 -0800 Subject: [PATCH 089/118] ata: libahci_platform: Export again ahci_platform_able_phys() This reverts commit 6bb86fefa086faba7b60bb452300b76a47cde1a5 ("libahci_platform: Staticize ahci_platform_able_phys()") we are going to need ahci_platform_{enable,disable}_phys() in a subsequent commit for ahci_brcm.c in order to properly control the PHY initialization order. Also make sure the function prototypes are declared in include/linux/ahci_platform.h as a result. Cc: stable@vger.kernel.org Reviewed-by: Hans de Goede Signed-off-by: Florian Fainelli Signed-off-by: Jens Axboe --- drivers/ata/libahci_platform.c | 6 ++++-- include/linux/ahci_platform.h | 2 ++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/ata/libahci_platform.c b/drivers/ata/libahci_platform.c index 8befce036af8..129556fcf6be 100644 --- a/drivers/ata/libahci_platform.c +++ b/drivers/ata/libahci_platform.c @@ -43,7 +43,7 @@ EXPORT_SYMBOL_GPL(ahci_platform_ops); * RETURNS: * 0 on success otherwise a negative error code */ -static int ahci_platform_enable_phys(struct ahci_host_priv *hpriv) +int ahci_platform_enable_phys(struct ahci_host_priv *hpriv) { int rc, i; @@ -74,6 +74,7 @@ static int ahci_platform_enable_phys(struct ahci_host_priv *hpriv) } return rc; } +EXPORT_SYMBOL_GPL(ahci_platform_enable_phys); /** * ahci_platform_disable_phys - Disable PHYs @@ -81,7 +82,7 @@ static int ahci_platform_enable_phys(struct ahci_host_priv *hpriv) * * This function disables all PHYs found in hpriv->phys. */ -static void ahci_platform_disable_phys(struct ahci_host_priv *hpriv) +void ahci_platform_disable_phys(struct ahci_host_priv *hpriv) { int i; @@ -90,6 +91,7 @@ static void ahci_platform_disable_phys(struct ahci_host_priv *hpriv) phy_exit(hpriv->phys[i]); } } +EXPORT_SYMBOL_GPL(ahci_platform_disable_phys); /** * ahci_platform_enable_clks - Enable platform clocks diff --git a/include/linux/ahci_platform.h b/include/linux/ahci_platform.h index 6782f0d45ebe..49e5383d4222 100644 --- a/include/linux/ahci_platform.h +++ b/include/linux/ahci_platform.h @@ -19,6 +19,8 @@ struct ahci_host_priv; struct platform_device; struct scsi_host_template; +int ahci_platform_enable_phys(struct ahci_host_priv *hpriv); +void ahci_platform_disable_phys(struct ahci_host_priv *hpriv); int ahci_platform_enable_clks(struct ahci_host_priv *hpriv); void ahci_platform_disable_clks(struct ahci_host_priv *hpriv); int ahci_platform_enable_regulators(struct ahci_host_priv *hpriv); From c0cdf2ac4b5bf3e5ef2451ea29fb4104278cdabc Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 10 Dec 2019 10:53:45 -0800 Subject: [PATCH 090/118] ata: ahci_brcm: Fix AHCI resources management The AHCI resources management within ahci_brcm.c is a little convoluted, largely because it historically had a dedicated clock that was managed within this file in the downstream tree. Once brough upstream though, the clock was left to be managed by libahci_platform.c which is entirely appropriate. This patch series ensures that the AHCI resources are fetched and enabled before any register access is done, thus avoiding bus errors on platforms which clock gate the controller by default. As a result we need to re-arrange the suspend() and resume() functions in order to avoid accessing registers after the clocks have been turned off respectively before the clocks have been turned on. Finally, we can refactor brcm_ahci_get_portmask() in order to fetch the number of ports from hpriv->mmio which is now accessible without jumping through hoops like we used to do. The commit pointed in the Fixes tag is both old and new enough not to require major headaches for backporting of this patch. Fixes: eba68f829794 ("ata: ahci_brcmstb: rename to support across Broadcom SoC's") Cc: stable@vger.kernel.org Reviewed-by: Hans de Goede Signed-off-by: Florian Fainelli Signed-off-by: Jens Axboe --- drivers/ata/ahci_brcm.c | 105 +++++++++++++++++++++++++++++----------- 1 file changed, 76 insertions(+), 29 deletions(-) diff --git a/drivers/ata/ahci_brcm.c b/drivers/ata/ahci_brcm.c index f41744b9b38a..a8b2f3f7bbbc 100644 --- a/drivers/ata/ahci_brcm.c +++ b/drivers/ata/ahci_brcm.c @@ -213,19 +213,12 @@ static void brcm_sata_phys_disable(struct brcm_ahci_priv *priv) brcm_sata_phy_disable(priv, i); } -static u32 brcm_ahci_get_portmask(struct platform_device *pdev, +static u32 brcm_ahci_get_portmask(struct ahci_host_priv *hpriv, struct brcm_ahci_priv *priv) { - void __iomem *ahci; - struct resource *res; u32 impl; - res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "ahci"); - ahci = devm_ioremap_resource(&pdev->dev, res); - if (IS_ERR(ahci)) - return 0; - - impl = readl(ahci + HOST_PORTS_IMPL); + impl = readl(hpriv->mmio + HOST_PORTS_IMPL); if (fls(impl) > SATA_TOP_MAX_PHYS) dev_warn(priv->dev, "warning: more ports than PHYs (%#x)\n", @@ -233,9 +226,6 @@ static u32 brcm_ahci_get_portmask(struct platform_device *pdev, else if (!impl) dev_info(priv->dev, "no ports found\n"); - devm_iounmap(&pdev->dev, ahci); - devm_release_mem_region(&pdev->dev, res->start, resource_size(res)); - return impl; } @@ -347,11 +337,10 @@ static int brcm_ahci_suspend(struct device *dev) struct ata_host *host = dev_get_drvdata(dev); struct ahci_host_priv *hpriv = host->private_data; struct brcm_ahci_priv *priv = hpriv->plat_data; - int ret; - ret = ahci_platform_suspend(dev); brcm_sata_phys_disable(priv); - return ret; + + return ahci_platform_suspend(dev); } static int brcm_ahci_resume(struct device *dev) @@ -359,11 +348,44 @@ static int brcm_ahci_resume(struct device *dev) struct ata_host *host = dev_get_drvdata(dev); struct ahci_host_priv *hpriv = host->private_data; struct brcm_ahci_priv *priv = hpriv->plat_data; + int ret; + + /* Make sure clocks are turned on before re-configuration */ + ret = ahci_platform_enable_clks(hpriv); + if (ret) + return ret; brcm_sata_init(priv); brcm_sata_phys_enable(priv); brcm_sata_alpm_init(hpriv); - return ahci_platform_resume(dev); + + /* Since we had to enable clocks earlier on, we cannot use + * ahci_platform_resume() as-is since a second call to + * ahci_platform_enable_resources() would bump up the resources + * (regulators, clocks, PHYs) count artificially so we copy the part + * after ahci_platform_enable_resources(). + */ + ret = ahci_platform_enable_phys(hpriv); + if (ret) + goto out_disable_phys; + + ret = ahci_platform_resume_host(dev); + if (ret) + goto out_disable_platform_phys; + + /* We resumed so update PM runtime state */ + pm_runtime_disable(dev); + pm_runtime_set_active(dev); + pm_runtime_enable(dev); + + return 0; + +out_disable_platform_phys: + ahci_platform_disable_phys(hpriv); +out_disable_phys: + brcm_sata_phys_disable(priv); + ahci_platform_disable_clks(hpriv); + return ret; } #endif @@ -416,38 +438,63 @@ static int brcm_ahci_probe(struct platform_device *pdev) priv->quirks |= BRCM_AHCI_QUIRK_SKIP_PHY_ENABLE; } + hpriv = ahci_platform_get_resources(pdev, 0); + if (IS_ERR(hpriv)) { + ret = PTR_ERR(hpriv); + goto out_reset; + } + + ret = ahci_platform_enable_clks(hpriv); + if (ret) + goto out_reset; + + /* Must be first so as to configure endianness including that + * of the standard AHCI register space. + */ brcm_sata_init(priv); - priv->port_mask = brcm_ahci_get_portmask(pdev, priv); - if (!priv->port_mask) - return -ENODEV; + /* Initializes priv->port_mask which is used below */ + priv->port_mask = brcm_ahci_get_portmask(hpriv, priv); + if (!priv->port_mask) { + ret = -ENODEV; + goto out_disable_clks; + } + /* Must be done before ahci_platform_enable_phys() */ brcm_sata_phys_enable(priv); - hpriv = ahci_platform_get_resources(pdev, 0); - if (IS_ERR(hpriv)) - return PTR_ERR(hpriv); hpriv->plat_data = priv; hpriv->flags = AHCI_HFLAG_WAKE_BEFORE_STOP; brcm_sata_alpm_init(hpriv); - ret = ahci_platform_enable_resources(hpriv); - if (ret) - return ret; - if (priv->quirks & BRCM_AHCI_QUIRK_NO_NCQ) hpriv->flags |= AHCI_HFLAG_NO_NCQ; hpriv->flags |= AHCI_HFLAG_NO_WRITE_TO_RO; + ret = ahci_platform_enable_phys(hpriv); + if (ret) + goto out_disable_phys; + ret = ahci_platform_init_host(pdev, hpriv, &ahci_brcm_port_info, &ahci_platform_sht); if (ret) - return ret; + goto out_disable_platform_phys; dev_info(dev, "Broadcom AHCI SATA3 registered\n"); return 0; + +out_disable_platform_phys: + ahci_platform_disable_phys(hpriv); +out_disable_phys: + brcm_sata_phys_disable(priv); +out_disable_clks: + ahci_platform_disable_clks(hpriv); +out_reset: + if (!IS_ERR_OR_NULL(priv->rcdev)) + reset_control_assert(priv->rcdev); + return ret; } static int brcm_ahci_remove(struct platform_device *pdev) @@ -457,12 +504,12 @@ static int brcm_ahci_remove(struct platform_device *pdev) struct brcm_ahci_priv *priv = hpriv->plat_data; int ret; + brcm_sata_phys_disable(priv); + ret = ata_platform_remove_one(pdev); if (ret) return ret; - brcm_sata_phys_disable(priv); - return 0; } From 1a3d78cb6e20779a19388315bd8efefbd8d4a656 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 10 Dec 2019 10:53:46 -0800 Subject: [PATCH 091/118] ata: ahci_brcm: BCM7425 AHCI requires AHCI_HFLAG_DELAY_ENGINE Set AHCI_HFLAG_DELAY_ENGINE for the BCM7425 AHCI controller thus making it conforming to the 'strict' AHCI implementation which this controller is based on. This solves long link establishment with specific hard drives (e.g.: Seagate ST1000VM002-9ZL1 SC12) that would otherwise have to complete the error recovery handling before finally establishing a succesful SATA link at the desired speed. We re-order the hpriv->flags assignment to also remove the NONCQ quirk since we can set the flag directly. Fixes: 9586114cf1e9 ("ata: ahci_brcmstb: add support MIPS-based platforms") Fixes: 423be77daabe ("ata: ahci_brcmstb: add quirk for broken ncq") Cc: stable@vger.kernel.org Reviewed-by: Hans de Goede Signed-off-by: Florian Fainelli Signed-off-by: Jens Axboe --- drivers/ata/ahci_brcm.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/drivers/ata/ahci_brcm.c b/drivers/ata/ahci_brcm.c index a8b2f3f7bbbc..58f8fd7bb8b8 100644 --- a/drivers/ata/ahci_brcm.c +++ b/drivers/ata/ahci_brcm.c @@ -76,8 +76,7 @@ enum brcm_ahci_version { }; enum brcm_ahci_quirks { - BRCM_AHCI_QUIRK_NO_NCQ = BIT(0), - BRCM_AHCI_QUIRK_SKIP_PHY_ENABLE = BIT(1), + BRCM_AHCI_QUIRK_SKIP_PHY_ENABLE = BIT(0), }; struct brcm_ahci_priv { @@ -432,18 +431,27 @@ static int brcm_ahci_probe(struct platform_device *pdev) if (!IS_ERR_OR_NULL(priv->rcdev)) reset_control_deassert(priv->rcdev); - if ((priv->version == BRCM_SATA_BCM7425) || - (priv->version == BRCM_SATA_NSP)) { - priv->quirks |= BRCM_AHCI_QUIRK_NO_NCQ; - priv->quirks |= BRCM_AHCI_QUIRK_SKIP_PHY_ENABLE; - } - hpriv = ahci_platform_get_resources(pdev, 0); if (IS_ERR(hpriv)) { ret = PTR_ERR(hpriv); goto out_reset; } + hpriv->plat_data = priv; + hpriv->flags = AHCI_HFLAG_WAKE_BEFORE_STOP | AHCI_HFLAG_NO_WRITE_TO_RO; + + switch (priv->version) { + case BRCM_SATA_BCM7425: + hpriv->flags |= AHCI_HFLAG_DELAY_ENGINE; + /* fall through */ + case BRCM_SATA_NSP: + hpriv->flags |= AHCI_HFLAG_NO_NCQ; + priv->quirks |= BRCM_AHCI_QUIRK_SKIP_PHY_ENABLE; + break; + default: + break; + } + ret = ahci_platform_enable_clks(hpriv); if (ret) goto out_reset; @@ -463,15 +471,8 @@ static int brcm_ahci_probe(struct platform_device *pdev) /* Must be done before ahci_platform_enable_phys() */ brcm_sata_phys_enable(priv); - hpriv->plat_data = priv; - hpriv->flags = AHCI_HFLAG_WAKE_BEFORE_STOP; - brcm_sata_alpm_init(hpriv); - if (priv->quirks & BRCM_AHCI_QUIRK_NO_NCQ) - hpriv->flags |= AHCI_HFLAG_NO_NCQ; - hpriv->flags |= AHCI_HFLAG_NO_WRITE_TO_RO; - ret = ahci_platform_enable_phys(hpriv); if (ret) goto out_disable_phys; From bf0e5013bc2dcac205417e1252205dca39dfc005 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 10 Dec 2019 10:53:47 -0800 Subject: [PATCH 092/118] ata: ahci_brcm: Add missing clock management during recovery The downstream implementation of ahci_brcm.c did contain clock management recovery, but until recently, did that outside of the libahci_platform helpers and this was unintentionally stripped out while forward porting the patch upstream. Add the missing clock management during recovery and sleep for 10 milliseconds per the design team recommendations to ensure the SATA PHY controller and AFE have been fully quiesced. Fixes: eb73390ae241 ("ata: ahci_brcm: Recover from failures to identify devices") Cc: stable@vger.kernel.org Reviewed-by: Hans de Goede Signed-off-by: Florian Fainelli Signed-off-by: Jens Axboe --- drivers/ata/ahci_brcm.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/ata/ahci_brcm.c b/drivers/ata/ahci_brcm.c index 58f8fd7bb8b8..66a570d0da83 100644 --- a/drivers/ata/ahci_brcm.c +++ b/drivers/ata/ahci_brcm.c @@ -274,6 +274,13 @@ static unsigned int brcm_ahci_read_id(struct ata_device *dev, /* Perform the SATA PHY reset sequence */ brcm_sata_phy_disable(priv, ap->port_no); + /* Reset the SATA clock */ + ahci_platform_disable_clks(hpriv); + msleep(10); + + ahci_platform_enable_clks(hpriv); + msleep(10); + /* Bring the PHY back on */ brcm_sata_phy_enable(priv, ap->port_no); From a5bcd72e054aabb93ddc51ed8cde36a5bfc50271 Mon Sep 17 00:00:00 2001 From: Vladyslav Tarasiuk Date: Thu, 26 Dec 2019 10:41:56 +0200 Subject: [PATCH 093/118] net/mlxfw: Fix out-of-memory error in mfa2 flash burning The burning process requires to perform internal allocations of large chunks of memory. This memory doesn't need to be contiguous and can be safely allocated by vzalloc() instead of kzalloc(). This patch changes such allocation to avoid possible out-of-memory failure. Fixes: 410ed13cae39 ("Add the mlxfw module for Mellanox firmware flash process") Signed-off-by: Vladyslav Tarasiuk Reviewed-by: Aya Levin Signed-off-by: Leon Romanovsky Tested-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2.c b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2.c index 544344ac4894..79057af4fe99 100644 --- a/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2.c +++ b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include "mlxfw_mfa2.h" #include "mlxfw_mfa2_file.h" @@ -548,7 +549,7 @@ mlxfw_mfa2_file_component_get(const struct mlxfw_mfa2_file *mfa2_file, comp_size = be32_to_cpu(comp->size); comp_buf_size = comp_size + mlxfw_mfa2_comp_magic_len; - comp_data = kmalloc(sizeof(*comp_data) + comp_buf_size, GFP_KERNEL); + comp_data = vzalloc(sizeof(*comp_data) + comp_buf_size); if (!comp_data) return ERR_PTR(-ENOMEM); comp_data->comp.data_size = comp_size; @@ -570,7 +571,7 @@ mlxfw_mfa2_file_component_get(const struct mlxfw_mfa2_file *mfa2_file, comp_data->comp.data = comp_data->buff + mlxfw_mfa2_comp_magic_len; return &comp_data->comp; err_out: - kfree(comp_data); + vfree(comp_data); return ERR_PTR(err); } @@ -579,7 +580,7 @@ void mlxfw_mfa2_file_component_put(struct mlxfw_mfa2_component *comp) const struct mlxfw_mfa2_comp_data *comp_data; comp_data = container_of(comp, struct mlxfw_mfa2_comp_data, comp); - kfree(comp_data); + vfree(comp_data); } void mlxfw_mfa2_file_fini(struct mlxfw_mfa2_file *mfa2_file) From c27569fcd6e1b11bd24361346504f2995a256e4e Mon Sep 17 00:00:00 2001 From: Madalin Bucur Date: Mon, 23 Dec 2019 09:39:22 +0200 Subject: [PATCH 094/118] dpaa_eth: fix DMA mapping leak On the error path some fragments remain DMA mapped. Adding a fix that unmaps all the fragments. Rework cleanup path to be simpler. Fixes: 8151ee88bad5 ("dpaa_eth: use page backed rx buffers") Signed-off-by: Madalin Bucur Signed-off-by: David S. Miller --- .../net/ethernet/freescale/dpaa/dpaa_eth.c | 39 ++++++++++--------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c index 6a9d12dad5d9..a301f0095223 100644 --- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c @@ -1719,7 +1719,7 @@ static struct sk_buff *sg_fd_to_skb(const struct dpaa_priv *priv, int page_offset; unsigned int sz; int *count_ptr; - int i; + int i, j; vaddr = phys_to_virt(addr); WARN_ON(!IS_ALIGNED((unsigned long)vaddr, SMP_CACHE_BYTES)); @@ -1736,14 +1736,14 @@ static struct sk_buff *sg_fd_to_skb(const struct dpaa_priv *priv, WARN_ON(!IS_ALIGNED((unsigned long)sg_vaddr, SMP_CACHE_BYTES)); + dma_unmap_page(priv->rx_dma_dev, sg_addr, + DPAA_BP_RAW_SIZE, DMA_FROM_DEVICE); + /* We may use multiple Rx pools */ dpaa_bp = dpaa_bpid2pool(sgt[i].bpid); if (!dpaa_bp) goto free_buffers; - count_ptr = this_cpu_ptr(dpaa_bp->percpu_count); - dma_unmap_page(priv->rx_dma_dev, sg_addr, - DPAA_BP_RAW_SIZE, DMA_FROM_DEVICE); if (!skb) { sz = dpaa_bp->size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); @@ -1786,7 +1786,9 @@ static struct sk_buff *sg_fd_to_skb(const struct dpaa_priv *priv, skb_add_rx_frag(skb, i - 1, head_page, frag_off, frag_len, dpaa_bp->size); } + /* Update the pool count for the current {cpu x bpool} */ + count_ptr = this_cpu_ptr(dpaa_bp->percpu_count); (*count_ptr)--; if (qm_sg_entry_is_final(&sgt[i])) @@ -1800,26 +1802,25 @@ static struct sk_buff *sg_fd_to_skb(const struct dpaa_priv *priv, return skb; free_buffers: - /* compensate sw bpool counter changes */ - for (i--; i >= 0; i--) { - dpaa_bp = dpaa_bpid2pool(sgt[i].bpid); - if (dpaa_bp) { - count_ptr = this_cpu_ptr(dpaa_bp->percpu_count); - (*count_ptr)++; - } - } /* free all the SG entries */ - for (i = 0; i < DPAA_SGT_MAX_ENTRIES ; i++) { - sg_addr = qm_sg_addr(&sgt[i]); + for (j = 0; j < DPAA_SGT_MAX_ENTRIES ; j++) { + sg_addr = qm_sg_addr(&sgt[j]); sg_vaddr = phys_to_virt(sg_addr); + /* all pages 0..i were unmaped */ + if (j > i) + dma_unmap_page(priv->rx_dma_dev, qm_sg_addr(&sgt[j]), + DPAA_BP_RAW_SIZE, DMA_FROM_DEVICE); free_pages((unsigned long)sg_vaddr, 0); - dpaa_bp = dpaa_bpid2pool(sgt[i].bpid); - if (dpaa_bp) { - count_ptr = this_cpu_ptr(dpaa_bp->percpu_count); - (*count_ptr)--; + /* counters 0..i-1 were decremented */ + if (j >= i) { + dpaa_bp = dpaa_bpid2pool(sgt[j].bpid); + if (dpaa_bp) { + count_ptr = this_cpu_ptr(dpaa_bp->percpu_count); + (*count_ptr)--; + } } - if (qm_sg_entry_is_final(&sgt[i])) + if (qm_sg_entry_is_final(&sgt[j])) break; } /* free the SGT fragment */ From 1c93fb45761e79b3c00080e71523886cefaf351c Mon Sep 17 00:00:00 2001 From: Madalin Bucur Date: Mon, 23 Dec 2019 10:06:10 +0200 Subject: [PATCH 095/118] net: phy: aquantia: add suspend / resume ops for AQR105 The suspend/resume code for AQR107 works on AQR105 too. This patch fixes issues with the partner not seeing the link down when the interface using AQR105 is brought down. Fixes: bee8259dd31f ("net: phy: add driver for aquantia phy") Signed-off-by: Madalin Bucur Signed-off-by: David S. Miller --- drivers/net/phy/aquantia_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/phy/aquantia_main.c b/drivers/net/phy/aquantia_main.c index 3b29d381116f..975789d9349d 100644 --- a/drivers/net/phy/aquantia_main.c +++ b/drivers/net/phy/aquantia_main.c @@ -627,6 +627,8 @@ static struct phy_driver aqr_driver[] = { .config_intr = aqr_config_intr, .ack_interrupt = aqr_ack_interrupt, .read_status = aqr_read_status, + .suspend = aqr107_suspend, + .resume = aqr107_resume, }, { PHY_ID_MATCH_MODEL(PHY_ID_AQR106), From 7df2281a174bd0fdbb2211a26914e5440740fcde Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 23 Dec 2019 11:03:21 +0100 Subject: [PATCH 096/118] of: mdio: Add missing inline to of_mdiobus_child_is_phy() dummy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If CONFIG_OF_MDIO=n: drivers/net/phy/mdio_bus.c:23: include/linux/of_mdio.h:58:13: warning: ‘of_mdiobus_child_is_phy’ defined but not used [-Wunused-function] static bool of_mdiobus_child_is_phy(struct device_node *child) ^~~~~~~~~~~~~~~~~~~~~~~ Fix this by adding the missing "inline" keyword. Fixes: 0aa4d016c043d16a ("of: mdio: export of_mdiobus_child_is_phy") Signed-off-by: Geert Uytterhoeven Reviewed-by: Andrew Lunn Acked-by: Borislav Petkov Signed-off-by: David S. Miller --- include/linux/of_mdio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/of_mdio.h b/include/linux/of_mdio.h index 79bc82e30c02..491a2b7e77c1 100644 --- a/include/linux/of_mdio.h +++ b/include/linux/of_mdio.h @@ -55,7 +55,7 @@ static inline int of_mdio_parse_addr(struct device *dev, } #else /* CONFIG_OF_MDIO */ -static bool of_mdiobus_child_is_phy(struct device_node *child) +static inline bool of_mdiobus_child_is_phy(struct device_node *child) { return false; } From 0444716a5dd563526e53ae686115987d5d4c249e Mon Sep 17 00:00:00 2001 From: Manish Chopra Date: Mon, 23 Dec 2019 10:23:08 -0800 Subject: [PATCH 097/118] bnx2x: Use appropriate define for vlan credit Although it has same value as MAX_MAC_CREDIT_E2, use MAX_VLAN_CREDIT_E2 appropriately. Signed-off-by: Manish Chopra Signed-off-by: Ariel Elior Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.h index 7a6e82db4231..ed237854939a 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.h +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.h @@ -1537,7 +1537,7 @@ void bnx2x_get_rss_ind_table(struct bnx2x_rss_config_obj *rss_obj, func_num + GET_NUM_VFS_PER_PF(bp) * VF_MAC_CREDIT_CNT) #define PF_VLAN_CREDIT_E2(bp, func_num) \ - ((MAX_MAC_CREDIT_E2 - GET_NUM_VFS_PER_PATH(bp) * VF_VLAN_CREDIT_CNT) / \ + ((MAX_VLAN_CREDIT_E2 - GET_NUM_VFS_PER_PATH(bp) * VF_VLAN_CREDIT_CNT) /\ func_num + GET_NUM_VFS_PER_PF(bp) * VF_VLAN_CREDIT_CNT) #endif /* BNX2X_SP_VERBS */ From 5cdc40c7820ff66c2271e0884bd8ee8f7cfd769b Mon Sep 17 00:00:00 2001 From: Manish Chopra Date: Mon, 23 Dec 2019 10:23:09 -0800 Subject: [PATCH 098/118] bnx2x: Fix accounting of vlan resources among the PFs While testing max vlan configuration on the PF, firmware gets assert as driver was configuring number of vlans more than what is supported per port/engine, it was figured out that there is an implicit vlan (hidden default vlan consuming hardware cam entry resource) which is configured default for all the clients (PF/VFs) on client_init ramrod by the adapter implicitly, so when allocating resources among the PFs this implicit vlan should be considered or total vlan entries should be reduced by one to accommodate that default/implicit vlan entry. Signed-off-by: Manish Chopra Signed-off-by: Ariel Elior Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.h index ed237854939a..bacc8552bce1 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.h +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.h @@ -1536,8 +1536,11 @@ void bnx2x_get_rss_ind_table(struct bnx2x_rss_config_obj *rss_obj, ((MAX_MAC_CREDIT_E2 - GET_NUM_VFS_PER_PATH(bp) * VF_MAC_CREDIT_CNT) / \ func_num + GET_NUM_VFS_PER_PF(bp) * VF_MAC_CREDIT_CNT) +#define BNX2X_VFS_VLAN_CREDIT(bp) \ + (GET_NUM_VFS_PER_PATH(bp) * VF_VLAN_CREDIT_CNT) + #define PF_VLAN_CREDIT_E2(bp, func_num) \ - ((MAX_VLAN_CREDIT_E2 - GET_NUM_VFS_PER_PATH(bp) * VF_VLAN_CREDIT_CNT) /\ + ((MAX_VLAN_CREDIT_E2 - 1 - BNX2X_VFS_VLAN_CREDIT(bp)) / \ func_num + GET_NUM_VFS_PER_PF(bp) * VF_VLAN_CREDIT_CNT) #endif /* BNX2X_SP_VERBS */ From bb3d0b8bf5be61ab1d6f472c43cbf34de17e796b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 23 Dec 2019 11:13:24 -0800 Subject: [PATCH 099/118] net_sched: sch_fq: properly set sk->sk_pacing_status If fq_classify() recycles a struct fq_flow because a socket structure has been reallocated, we do not set sk->sk_pacing_status immediately, but later if the flow becomes detached. This means that any flow requiring pacing (BBR, or SO_MAX_PACING_RATE) might fallback to TCP internal pacing, which requires a per-socket high resolution timer, and therefore more cpu cycles. Fixes: 218af599fa63 ("tcp: internal implementation for pacing") Signed-off-by: Eric Dumazet Cc: Soheil Hassas Yeganeh Cc: Neal Cardwell Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/sched/sch_fq.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index b1c7e726ce5d..ff4c5e9d0d77 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -301,6 +301,9 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) f->socket_hash != sk->sk_hash)) { f->credit = q->initial_quantum; f->socket_hash = sk->sk_hash; + if (q->rate_enable) + smp_store_release(&sk->sk_pacing_status, + SK_PACING_FQ); if (fq_flow_is_throttled(f)) fq_flow_unset_throttled(q, f); f->time_next_packet = 0ULL; @@ -322,8 +325,12 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) fq_flow_set_detached(f); f->sk = sk; - if (skb->sk == sk) + if (skb->sk == sk) { f->socket_hash = sk->sk_hash; + if (q->rate_enable) + smp_store_release(&sk->sk_pacing_status, + SK_PACING_FQ); + } f->credit = q->initial_quantum; rb_link_node(&f->fq_node, parent, p); @@ -428,17 +435,9 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, f->qlen++; qdisc_qstats_backlog_inc(sch, skb); if (fq_flow_is_detached(f)) { - struct sock *sk = skb->sk; - fq_flow_add_tail(&q->new_flows, f); if (time_after(jiffies, f->age + q->flow_refill_delay)) f->credit = max_t(u32, f->credit, q->quantum); - if (sk && q->rate_enable) { - if (unlikely(smp_load_acquire(&sk->sk_pacing_status) != - SK_PACING_FQ)) - smp_store_release(&sk->sk_pacing_status, - SK_PACING_FQ); - } q->inactive_flows--; } From 70cf3dc7313207816255b9acb0dffb19dae78144 Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Wed, 25 Dec 2019 10:51:01 +0200 Subject: [PATCH 100/118] net/sched: act_mirred: Pull mac prior redir to non mac_header_xmit device There's no skb_pull performed when a mirred action is set at egress of a mac device, with a target device/action that expects skb->data to point at the network header. As a result, either the target device is errornously given an skb with data pointing to the mac (egress case), or the net stack receives the skb with data pointing to the mac (ingress case). E.g: # tc qdisc add dev eth9 root handle 1: prio # tc filter add dev eth9 parent 1: prio 9 protocol ip handle 9 basic \ action mirred egress redirect dev tun0 (tun0 is a tun device. result: tun0 errornously gets the eth header instead of the iph) Revise the push/pull logic of tcf_mirred_act() to not rely on the skb_at_tc_ingress() vs tcf_mirred_act_wants_ingress() comparison, as it does not cover all "pull" cases. Instead, calculate whether the required action on the target device requires the data to point at the network header, and compare this to whether skb->data points to network header - and make the push/pull adjustments as necessary. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Shmulik Ladkani Tested-by: Jamal Hadi Salim Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/act_mirred.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 1e3eb3a97532..1ad300e6dbc0 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -219,8 +219,10 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, bool use_reinsert; bool want_ingress; bool is_redirect; + bool expects_nh; int m_eaction; int mac_len; + bool at_nh; rec_level = __this_cpu_inc_return(mirred_rec_level); if (unlikely(rec_level > MIRRED_RECURSION_LIMIT)) { @@ -261,19 +263,19 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, goto out; } - /* If action's target direction differs than filter's direction, - * and devices expect a mac header on xmit, then mac push/pull is - * needed. - */ want_ingress = tcf_mirred_act_wants_ingress(m_eaction); - if (skb_at_tc_ingress(skb) != want_ingress && m_mac_header_xmit) { - if (!skb_at_tc_ingress(skb)) { - /* caught at egress, act ingress: pull mac */ - mac_len = skb_network_header(skb) - skb_mac_header(skb); + + expects_nh = want_ingress || !m_mac_header_xmit; + at_nh = skb->data == skb_network_header(skb); + if (at_nh != expects_nh) { + mac_len = skb_at_tc_ingress(skb) ? skb->mac_len : + skb_network_header(skb) - skb_mac_header(skb); + if (expects_nh) { + /* target device/action expect data at nh */ skb_pull_rcsum(skb2, mac_len); } else { - /* caught at ingress, act egress: push mac */ - skb_push_rcsum(skb2, skb->mac_len); + /* target device/action expect data at mac */ + skb_push_rcsum(skb2, mac_len); } } From bd6f48546b9cb7a785344fc78058c420923d7ed8 Mon Sep 17 00:00:00 2001 From: Martin Blumenstingl Date: Thu, 26 Dec 2019 20:01:01 +0100 Subject: [PATCH 101/118] net: stmmac: dwmac-meson8b: Fix the RGMII TX delay on Meson8b/8m2 SoCs GXBB and newer SoCs use the fixed FCLK_DIV2 (1GHz) clock as input for the m250_sel clock. Meson8b and Meson8m2 use MPLL2 instead, whose rate can be adjusted at runtime. So far we have been running MPLL2 with ~250MHz (and the internal m250_div with value 1), which worked enough that we could transfer data with an TX delay of 4ns. Unfortunately there is high packet loss with an RGMII PHY when transferring data (receiving data works fine though). Odroid-C1's u-boot is running with a TX delay of only 2ns as well as the internal m250_div set to 2 - no lost (TX) packets can be observed with that setting in u-boot. Manual testing has shown that the TX packet loss goes away when using the following settings in Linux (the vendor kernel uses the same settings): - MPLL2 clock set to ~500MHz - m250_div set to 2 - TX delay set to 2ns on the MAC side Update the m250_div divider settings to only accept dividers greater or equal 2 to fix the TX delay generated by the MAC. iperf3 results before the change: [ ID] Interval Transfer Bitrate Retr [ 5] 0.00-10.00 sec 182 MBytes 153 Mbits/sec 514 sender [ 5] 0.00-10.00 sec 182 MBytes 152 Mbits/sec receiver iperf3 results after the change (including an updated TX delay of 2ns): [ ID] Interval Transfer Bitrate Retr Cwnd [ 5] 0.00-10.00 sec 927 MBytes 778 Mbits/sec 0 sender [ 5] 0.00-10.01 sec 927 MBytes 777 Mbits/sec receiver Fixes: 4f6a71b84e1afd ("net: stmmac: dwmac-meson8b: fix internal RGMII clock configuration") Signed-off-by: Martin Blumenstingl Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- .../net/ethernet/stmicro/stmmac/dwmac-meson8b.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c index bd6c01004913..0e2fa14f1423 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c @@ -112,6 +112,14 @@ static int meson8b_init_rgmii_tx_clk(struct meson8b_dwmac *dwmac) struct device *dev = dwmac->dev; const char *parent_name, *mux_parent_names[MUX_CLK_NUM_PARENTS]; struct meson8b_dwmac_clk_configs *clk_configs; + static const struct clk_div_table div_table[] = { + { .div = 2, .val = 2, }, + { .div = 3, .val = 3, }, + { .div = 4, .val = 4, }, + { .div = 5, .val = 5, }, + { .div = 6, .val = 6, }, + { .div = 7, .val = 7, }, + }; clk_configs = devm_kzalloc(dev, sizeof(*clk_configs), GFP_KERNEL); if (!clk_configs) @@ -146,9 +154,9 @@ static int meson8b_init_rgmii_tx_clk(struct meson8b_dwmac *dwmac) clk_configs->m250_div.reg = dwmac->regs + PRG_ETH0; clk_configs->m250_div.shift = PRG_ETH0_CLK_M250_DIV_SHIFT; clk_configs->m250_div.width = PRG_ETH0_CLK_M250_DIV_WIDTH; - clk_configs->m250_div.flags = CLK_DIVIDER_ONE_BASED | - CLK_DIVIDER_ALLOW_ZERO | - CLK_DIVIDER_ROUND_CLOSEST; + clk_configs->m250_div.table = div_table; + clk_configs->m250_div.flags = CLK_DIVIDER_ALLOW_ZERO | + CLK_DIVIDER_ROUND_CLOSEST; clk = meson8b_dwmac_register_clk(dwmac, "m250_div", &parent_name, 1, &clk_divider_ops, &clk_configs->m250_div.hw); From 4d47ce158efb7b145c680e79cd33e3c6fd773de4 Mon Sep 17 00:00:00 2001 From: Luc Van Oostenryck Date: Sun, 22 Dec 2019 10:26:04 +0100 Subject: [PATCH 102/118] riscv: fix compile failure with EXPORT_SYMBOL() & !MMU When support for !MMU was added, the declaration of __asm_copy_to_user() & __asm_copy_from_user() were #ifdefed out hence their EXPORT_SYMBOL() give an error message like: .../riscv_ksyms.c:13:15: error: '__asm_copy_to_user' undeclared here .../riscv_ksyms.c:14:15: error: '__asm_copy_from_user' undeclared here Since these symbols are not defined with !MMU it's wrong to export them. Same for __clear_user() (even though this one is also declared in include/asm-generic/uaccess.h and thus doesn't give an error message). Fix this by doing the EXPORT_SYMBOL() directly where these symbols are defined: inside lib/uaccess.S itself. Fixes: 6bd33e1ece52 ("riscv: fix compile failure with EXPORT_SYMBOL() & !MMU") Reported-by: kbuild test robot Cc: Christoph Hellwig Cc: Palmer Dabbelt Cc: Paul Walmsley Signed-off-by: Luc Van Oostenryck Signed-off-by: Paul Walmsley --- arch/riscv/kernel/riscv_ksyms.c | 3 --- arch/riscv/lib/uaccess.S | 4 ++++ 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/riscv/kernel/riscv_ksyms.c b/arch/riscv/kernel/riscv_ksyms.c index 4800cf703186..2a02b7eebee0 100644 --- a/arch/riscv/kernel/riscv_ksyms.c +++ b/arch/riscv/kernel/riscv_ksyms.c @@ -9,8 +9,5 @@ /* * Assembly functions that may be used (directly or indirectly) by modules */ -EXPORT_SYMBOL(__clear_user); -EXPORT_SYMBOL(__asm_copy_to_user); -EXPORT_SYMBOL(__asm_copy_from_user); EXPORT_SYMBOL(memset); EXPORT_SYMBOL(memcpy); diff --git a/arch/riscv/lib/uaccess.S b/arch/riscv/lib/uaccess.S index fecd65657a6f..f29d2ba2c0a6 100644 --- a/arch/riscv/lib/uaccess.S +++ b/arch/riscv/lib/uaccess.S @@ -1,4 +1,5 @@ #include +#include #include #include @@ -66,6 +67,8 @@ ENTRY(__asm_copy_from_user) j 3b ENDPROC(__asm_copy_to_user) ENDPROC(__asm_copy_from_user) +EXPORT_SYMBOL(__asm_copy_to_user) +EXPORT_SYMBOL(__asm_copy_from_user) ENTRY(__clear_user) @@ -108,6 +111,7 @@ ENTRY(__clear_user) bltu a0, a3, 5b j 3b ENDPROC(__clear_user) +EXPORT_SYMBOL(__clear_user) .section .fixup,"ax" .balign 4 From 556f47ac6083d778843e89aa21b1242eee2693ed Mon Sep 17 00:00:00 2001 From: David Abdurachmanov Date: Wed, 18 Dec 2019 10:47:56 +0200 Subject: [PATCH 103/118] riscv: reject invalid syscalls below -1 Running "stress-ng --enosys 4 -t 20 -v" showed a large number of kernel oops with "Unable to handle kernel paging request at virtual address" message. This happens when enosys stressor starts testing random non-valid syscalls. I forgot to redirect any syscall below -1 to sys_ni_syscall. With the patch kernel oops messages are gone while running stress-ng enosys stressor. Signed-off-by: David Abdurachmanov Fixes: 5340627e3fe0 ("riscv: add support for SECCOMP and SECCOMP_FILTER") Signed-off-by: Paul Walmsley --- arch/riscv/kernel/entry.S | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S index a1349ca64669..e163b7b64c86 100644 --- a/arch/riscv/kernel/entry.S +++ b/arch/riscv/kernel/entry.S @@ -246,6 +246,7 @@ check_syscall_nr: */ li t1, -1 beq a7, t1, ret_from_syscall_rejected + blt a7, t1, 1f /* Call syscall */ la s0, sys_call_table slli t0, a7, RISCV_LGPTR From 1833e327a5ea1d1f356fbf6ded0760c9ff4b0594 Mon Sep 17 00:00:00 2001 From: Olof Johansson Date: Mon, 16 Dec 2019 20:07:04 -0800 Subject: [PATCH 104/118] riscv: export flush_icache_all to modules This is needed by LKDTM (crash dump test module), it calls flush_icache_range(), which on RISC-V turns into flush_icache_all(). On other architectures, the actual implementation is exported, so follow that precedence and export it here too. Fixes build of CONFIG_LKDTM that fails with: ERROR: "flush_icache_all" [drivers/misc/lkdtm/lkdtm.ko] undefined! Signed-off-by: Olof Johansson Signed-off-by: Paul Walmsley --- arch/riscv/mm/cacheflush.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/riscv/mm/cacheflush.c b/arch/riscv/mm/cacheflush.c index 8f1900686640..8930ab7278e6 100644 --- a/arch/riscv/mm/cacheflush.c +++ b/arch/riscv/mm/cacheflush.c @@ -22,6 +22,7 @@ void flush_icache_all(void) else on_each_cpu(ipi_remote_fence_i, NULL, 1); } +EXPORT_SYMBOL(flush_icache_all); /* * Performs an icache flush for the given MM context. RISC-V has no direct From 98ca480a8f22fdbd768e3dad07024c8d4856576c Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sun, 22 Dec 2019 20:45:28 +0200 Subject: [PATCH 105/118] locks: print unsigned ino in /proc/locks An ino is unsigned, so display it as such in /proc/locks. Cc: stable@vger.kernel.org Signed-off-by: Amir Goldstein Signed-off-by: Jeff Layton --- fs/locks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/locks.c b/fs/locks.c index 6970f55daf54..44b6da032842 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2853,7 +2853,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, } if (inode) { /* userspace relies on this representation of dev_t */ - seq_printf(f, "%d %02x:%02x:%ld ", fl_pid, + seq_printf(f, "%d %02x:%02x:%lu ", fl_pid, MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), inode->i_ino); } else { From 314bd842d98e1035cc40b671a71e07f48420e58f Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Sun, 29 Dec 2019 13:40:22 +0200 Subject: [PATCH 106/118] mlxsw: spectrum_router: Skip loopback RIFs during MAC validation When a router interface (RIF) is created the MAC address of the backing netdev is verified to have the same MSBs as existing RIFs. This is required in order to avoid changing existing RIF MAC addresses that all share the same MSBs. Loopback RIFs are special in this regard as they do not have a MAC address, given they are only used to loop packets from the overlay to the underlay. Without this change, an error is returned when trying to create a RIF after the creation of a GRE tunnel that is represented by a loopback RIF. 'rif->dev->dev_addr' points to the GRE device's local IP, which does not share the same MSBs as physical interfaces. Adding an IP address to any physical interface results in: Error: mlxsw_spectrum: All router interface MAC addresses must have the same prefix. Fix this by skipping loopback RIFs during MAC validation. Fixes: 74bc99397438 ("mlxsw: spectrum_router: Veto unsupported RIF MAC addresses") Signed-off-by: Amit Cohen Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 08b7e9f964da..8290e82240fc 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -7079,6 +7079,9 @@ static int mlxsw_sp_router_port_check_rif_addr(struct mlxsw_sp *mlxsw_sp, for (i = 0; i < MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_RIFS); i++) { rif = mlxsw_sp->router->rifs[i]; + if (rif && rif->ops && + rif->ops->type == MLXSW_SP_RIF_TYPE_IPIP_LB) + continue; if (rif && rif->dev && rif->dev != dev && !ether_addr_equal_masked(rif->dev->dev_addr, dev_addr, mlxsw_sp->mac_mask)) { From acca789a358cc960be3937851d7de6591c79d6c2 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 29 Dec 2019 13:40:23 +0200 Subject: [PATCH 107/118] mlxsw: spectrum: Use dedicated policer for VRRP packets Currently, VRRP packets and packets that hit exceptions during routing (e.g., MTU error) are policed using the same policer towards the CPU. This means, for example, that misconfiguration of the MTU on a routed interface can prevent VRRP packets from reaching the CPU, which in turn can cause the VRRP daemon to assume it is the Master router. Fix this by using a dedicated policer for VRRP packets. Fixes: 11566d34f895 ("mlxsw: spectrum: Add VRRP traps") Signed-off-by: Ido Schimmel Reported-by: Alex Veber Tested-by: Alex Veber Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/reg.h | 1 + drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 9 +++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h b/drivers/net/ethernet/mellanox/mlxsw/reg.h index 5294a1622643..af30e8a76682 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/reg.h +++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h @@ -5472,6 +5472,7 @@ enum mlxsw_reg_htgt_trap_group { MLXSW_REG_HTGT_TRAP_GROUP_SP_LBERROR, MLXSW_REG_HTGT_TRAP_GROUP_SP_PTP0, MLXSW_REG_HTGT_TRAP_GROUP_SP_PTP1, + MLXSW_REG_HTGT_TRAP_GROUP_SP_VRRP, __MLXSW_REG_HTGT_TRAP_GROUP_MAX, MLXSW_REG_HTGT_TRAP_GROUP_MAX = __MLXSW_REG_HTGT_TRAP_GROUP_MAX - 1 diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 556dca328bb5..f7fd5e8fbf96 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -4542,8 +4542,8 @@ static const struct mlxsw_listener mlxsw_sp_listener[] = { MLXSW_SP_RXL_MARK(ROUTER_ALERT_IPV6, TRAP_TO_CPU, ROUTER_EXP, false), MLXSW_SP_RXL_MARK(IPIP_DECAP_ERROR, TRAP_TO_CPU, ROUTER_EXP, false), MLXSW_SP_RXL_MARK(DECAP_ECN0, TRAP_TO_CPU, ROUTER_EXP, false), - MLXSW_SP_RXL_MARK(IPV4_VRRP, TRAP_TO_CPU, ROUTER_EXP, false), - MLXSW_SP_RXL_MARK(IPV6_VRRP, TRAP_TO_CPU, ROUTER_EXP, false), + MLXSW_SP_RXL_MARK(IPV4_VRRP, TRAP_TO_CPU, VRRP, false), + MLXSW_SP_RXL_MARK(IPV6_VRRP, TRAP_TO_CPU, VRRP, false), /* PKT Sample trap */ MLXSW_RXL(mlxsw_sp_rx_listener_sample_func, PKT_SAMPLE, MIRROR_TO_CPU, false, SP_IP2ME, DISCARD), @@ -4626,6 +4626,10 @@ static int mlxsw_sp_cpu_policers_set(struct mlxsw_core *mlxsw_core) rate = 19 * 1024; burst_size = 12; break; + case MLXSW_REG_HTGT_TRAP_GROUP_SP_VRRP: + rate = 360; + burst_size = 7; + break; default: continue; } @@ -4665,6 +4669,7 @@ static int mlxsw_sp_trap_groups_set(struct mlxsw_core *mlxsw_core) case MLXSW_REG_HTGT_TRAP_GROUP_SP_OSPF: case MLXSW_REG_HTGT_TRAP_GROUP_SP_PIM: case MLXSW_REG_HTGT_TRAP_GROUP_SP_PTP0: + case MLXSW_REG_HTGT_TRAP_GROUP_SP_VRRP: priority = 5; tc = 5; break; From fd6988496e79a6a4bdb514a4655d2920209eb85d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 29 Dec 2019 15:29:16 -0800 Subject: [PATCH 108/118] Linux 5.5-rc4 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index caf14acf1953..b99d95df8075 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 5 PATCHLEVEL = 5 SUBLEVEL = 0 -EXTRAVERSION = -rc3 +EXTRAVERSION = -rc4 NAME = Kleptomaniac Octopus # *DOCUMENTATION* From 0caeaf6ad532f9be5a768a158627cb31921cc8b7 Mon Sep 17 00:00:00 2001 From: Rahul Lakkireddy Date: Mon, 30 Dec 2019 18:14:08 +0530 Subject: [PATCH 109/118] cxgb4/cxgb4vf: fix flow control display for auto negotiation As per 802.3-2005, Section Two, Annex 28B, Table 28B-2 [1], when _only_ Rx pause is enabled, both symmetric and asymmetric pause towards local device must be enabled. Also, firmware returns the local device's flow control pause params as part of advertised capabilities and negotiated params as part of current link attributes. So, fix up ethtool's flow control pause params fetch logic to read from acaps, instead of linkattr. [1] https://standards.ieee.org/standard/802_3-2005.html Fixes: c3168cabe1af ("cxgb4/cxgbvf: Handle 32-bit fw port capabilities") Signed-off-by: Surendra Mobiya Signed-off-by: Rahul Lakkireddy Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/cxgb4.h | 1 + .../ethernet/chelsio/cxgb4/cxgb4_ethtool.c | 4 ++-- drivers/net/ethernet/chelsio/cxgb4/t4_hw.c | 23 +++++++++++-------- .../ethernet/chelsio/cxgb4vf/cxgb4vf_main.c | 4 ++-- .../ethernet/chelsio/cxgb4vf/t4vf_common.h | 1 + .../net/ethernet/chelsio/cxgb4vf/t4vf_hw.c | 20 +++++++++------- 6 files changed, 32 insertions(+), 21 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h index a70ac2097892..becee29f5df7 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h @@ -504,6 +504,7 @@ struct link_config { enum cc_pause requested_fc; /* flow control user has requested */ enum cc_pause fc; /* actual link flow control */ + enum cc_pause advertised_fc; /* actual advertised flow control */ enum cc_fec requested_fec; /* Forward Error Correction: */ enum cc_fec fec; /* requested and actual in use */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c index 20ab3b6285a2..c837382ee522 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c @@ -807,8 +807,8 @@ static void get_pauseparam(struct net_device *dev, struct port_info *p = netdev_priv(dev); epause->autoneg = (p->link_cfg.requested_fc & PAUSE_AUTONEG) != 0; - epause->rx_pause = (p->link_cfg.fc & PAUSE_RX) != 0; - epause->tx_pause = (p->link_cfg.fc & PAUSE_TX) != 0; + epause->rx_pause = (p->link_cfg.advertised_fc & PAUSE_RX) != 0; + epause->tx_pause = (p->link_cfg.advertised_fc & PAUSE_TX) != 0; } static int set_pauseparam(struct net_device *dev, diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c index 19d18acfc9a6..844fdcf55118 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c @@ -4089,7 +4089,8 @@ static inline fw_port_cap32_t cc_to_fwcap_pause(enum cc_pause cc_pause) if (cc_pause & PAUSE_TX) fw_pause |= FW_PORT_CAP32_802_3_PAUSE; else - fw_pause |= FW_PORT_CAP32_802_3_ASM_DIR; + fw_pause |= FW_PORT_CAP32_802_3_ASM_DIR | + FW_PORT_CAP32_802_3_PAUSE; } else if (cc_pause & PAUSE_TX) { fw_pause |= FW_PORT_CAP32_802_3_ASM_DIR; } @@ -8563,17 +8564,17 @@ static fw_port_cap32_t lstatus_to_fwcap(u32 lstatus) void t4_handle_get_port_info(struct port_info *pi, const __be64 *rpl) { const struct fw_port_cmd *cmd = (const void *)rpl; - int action = FW_PORT_CMD_ACTION_G(be32_to_cpu(cmd->action_to_len16)); - struct adapter *adapter = pi->adapter; - struct link_config *lc = &pi->link_cfg; - int link_ok, linkdnrc; - enum fw_port_type port_type; - enum fw_port_module_type mod_type; - unsigned int speed, fc, fec; fw_port_cap32_t pcaps, acaps, lpacaps, linkattr; + struct link_config *lc = &pi->link_cfg; + struct adapter *adapter = pi->adapter; + unsigned int speed, fc, fec, adv_fc; + enum fw_port_module_type mod_type; + int action, link_ok, linkdnrc; + enum fw_port_type port_type; /* Extract the various fields from the Port Information message. */ + action = FW_PORT_CMD_ACTION_G(be32_to_cpu(cmd->action_to_len16)); switch (action) { case FW_PORT_ACTION_GET_PORT_INFO: { u32 lstatus = be32_to_cpu(cmd->u.info.lstatus_to_modtype); @@ -8611,6 +8612,7 @@ void t4_handle_get_port_info(struct port_info *pi, const __be64 *rpl) } fec = fwcap_to_cc_fec(acaps); + adv_fc = fwcap_to_cc_pause(acaps); fc = fwcap_to_cc_pause(linkattr); speed = fwcap_to_speed(linkattr); @@ -8667,7 +8669,9 @@ void t4_handle_get_port_info(struct port_info *pi, const __be64 *rpl) } if (link_ok != lc->link_ok || speed != lc->speed || - fc != lc->fc || fec != lc->fec) { /* something changed */ + fc != lc->fc || adv_fc != lc->advertised_fc || + fec != lc->fec) { + /* something changed */ if (!link_ok && lc->link_ok) { lc->link_down_rc = linkdnrc; dev_warn_ratelimited(adapter->pdev_dev, @@ -8677,6 +8681,7 @@ void t4_handle_get_port_info(struct port_info *pi, const __be64 *rpl) } lc->link_ok = link_ok; lc->speed = speed; + lc->advertised_fc = adv_fc; lc->fc = fc; lc->fec = fec; diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c index f6fc0875d5b0..f4d41f968afa 100644 --- a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c @@ -1690,8 +1690,8 @@ static void cxgb4vf_get_pauseparam(struct net_device *dev, struct port_info *pi = netdev_priv(dev); pauseparam->autoneg = (pi->link_cfg.requested_fc & PAUSE_AUTONEG) != 0; - pauseparam->rx_pause = (pi->link_cfg.fc & PAUSE_RX) != 0; - pauseparam->tx_pause = (pi->link_cfg.fc & PAUSE_TX) != 0; + pauseparam->rx_pause = (pi->link_cfg.advertised_fc & PAUSE_RX) != 0; + pauseparam->tx_pause = (pi->link_cfg.advertised_fc & PAUSE_TX) != 0; } /* diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_common.h b/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_common.h index ccca67cf4487..57cfd10a99ec 100644 --- a/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_common.h +++ b/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_common.h @@ -135,6 +135,7 @@ struct link_config { enum cc_pause requested_fc; /* flow control user has requested */ enum cc_pause fc; /* actual link flow control */ + enum cc_pause advertised_fc; /* actual advertised flow control */ enum cc_fec auto_fec; /* Forward Error Correction: */ enum cc_fec requested_fec; /* "automatic" (IEEE 802.3), */ diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c b/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c index 8a389d617a23..9d49ff211cc1 100644 --- a/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c +++ b/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c @@ -1913,16 +1913,16 @@ static const char *t4vf_link_down_rc_str(unsigned char link_down_rc) static void t4vf_handle_get_port_info(struct port_info *pi, const struct fw_port_cmd *cmd) { - int action = FW_PORT_CMD_ACTION_G(be32_to_cpu(cmd->action_to_len16)); - struct adapter *adapter = pi->adapter; - struct link_config *lc = &pi->link_cfg; - int link_ok, linkdnrc; - enum fw_port_type port_type; - enum fw_port_module_type mod_type; - unsigned int speed, fc, fec; fw_port_cap32_t pcaps, acaps, lpacaps, linkattr; + struct link_config *lc = &pi->link_cfg; + struct adapter *adapter = pi->adapter; + unsigned int speed, fc, fec, adv_fc; + enum fw_port_module_type mod_type; + int action, link_ok, linkdnrc; + enum fw_port_type port_type; /* Extract the various fields from the Port Information message. */ + action = FW_PORT_CMD_ACTION_G(be32_to_cpu(cmd->action_to_len16)); switch (action) { case FW_PORT_ACTION_GET_PORT_INFO: { u32 lstatus = be32_to_cpu(cmd->u.info.lstatus_to_modtype); @@ -1982,6 +1982,7 @@ static void t4vf_handle_get_port_info(struct port_info *pi, } fec = fwcap_to_cc_fec(acaps); + adv_fc = fwcap_to_cc_pause(acaps); fc = fwcap_to_cc_pause(linkattr); speed = fwcap_to_speed(linkattr); @@ -2012,7 +2013,9 @@ static void t4vf_handle_get_port_info(struct port_info *pi, } if (link_ok != lc->link_ok || speed != lc->speed || - fc != lc->fc || fec != lc->fec) { /* something changed */ + fc != lc->fc || adv_fc != lc->advertised_fc || + fec != lc->fec) { + /* something changed */ if (!link_ok && lc->link_ok) { lc->link_down_rc = linkdnrc; dev_warn_ratelimited(adapter->pdev_dev, @@ -2022,6 +2025,7 @@ static void t4vf_handle_get_port_info(struct port_info *pi, } lc->link_ok = link_ok; lc->speed = speed; + lc->advertised_fc = adv_fc; lc->fc = fc; lc->fec = fec; From 9fcf024dd6fae082f05e8c1fcdae23972b2f6971 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 27 Dec 2019 02:59:54 +0200 Subject: [PATCH 110/118] net: dsa: sja1105: Take PTP egress timestamp by port, not mgmt slot The PTP egress timestamp N must be captured from register PTPEGR_TS[n], where n = 2 * PORT + TSREG. There are 10 PTPEGR_TS registers, 2 per port. We are only using TSREG=0. As opposed to the management slots, which are 4 in number (SJA1105_NUM_PORTS, minus the CPU port). Any management frame (which includes PTP frames) can be sent to any non-CPU port through any management slot. When the CPU port is not the last port (#4), there will be a mismatch between the slot and the port number. Luckily, the only mainline occurrence with this switch (arch/arm/boot/dts/ls1021a-tsn.dts) does have the CPU port as #4, so the issue did not manifest itself thus far. Fixes: 47ed985e97f5 ("net: dsa: sja1105: Add logic for TX timestamping") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/dsa/sja1105/sja1105_main.c | 2 +- drivers/net/dsa/sja1105/sja1105_ptp.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index a51ac088c0bc..86bbab166633 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -1855,7 +1855,7 @@ static netdev_tx_t sja1105_port_deferred_xmit(struct dsa_switch *ds, int port, if (!clone) goto out; - sja1105_ptp_txtstamp_skb(ds, slot, clone); + sja1105_ptp_txtstamp_skb(ds, port, clone); out: mutex_unlock(&priv->mgmt_lock); diff --git a/drivers/net/dsa/sja1105/sja1105_ptp.c b/drivers/net/dsa/sja1105/sja1105_ptp.c index 54258a25031d..c0fda7db6271 100644 --- a/drivers/net/dsa/sja1105/sja1105_ptp.c +++ b/drivers/net/dsa/sja1105/sja1105_ptp.c @@ -659,7 +659,7 @@ void sja1105_ptp_clock_unregister(struct dsa_switch *ds) ptp_data->clock = NULL; } -void sja1105_ptp_txtstamp_skb(struct dsa_switch *ds, int slot, +void sja1105_ptp_txtstamp_skb(struct dsa_switch *ds, int port, struct sk_buff *skb) { struct sja1105_private *priv = ds->priv; @@ -679,7 +679,7 @@ void sja1105_ptp_txtstamp_skb(struct dsa_switch *ds, int slot, goto out; } - rc = sja1105_ptpegr_ts_poll(ds, slot, &ts); + rc = sja1105_ptpegr_ts_poll(ds, port, &ts); if (rc < 0) { dev_err(ds->dev, "timed out polling for tstamp\n"); kfree_skb(skb); From 5a47f588ee2366b2febdc822cdfdcf856cb0a777 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 27 Dec 2019 03:01:50 +0200 Subject: [PATCH 111/118] net: dsa: sja1105: Really make the PTP command read-write When activating tc-taprio offload on the switch ports, the TAS state machine will try to check whether it is running or not, but will find both the STARTED and STOPPED bits as false in the sja1105_tas_check_running function. So the function will return -EINVAL (an abnormal situation) and the kernel will keep printing this from the TAS FSM workqueue: [ 37.691971] sja1105 spi0.1: An operation returned -22 The reason is that the underlying function that gets called, sja1105_ptp_commit, does not actually do a SPI_READ, but a SPI_WRITE. So the command buffer remains initialized with zeroes instead of retrieving the hardware state. Fix that. Fixes: 41603d78b362 ("net: dsa: sja1105: Make the PTP command read-write") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/dsa/sja1105/sja1105_ptp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/sja1105/sja1105_ptp.c b/drivers/net/dsa/sja1105/sja1105_ptp.c index c0fda7db6271..43ab7589d0d0 100644 --- a/drivers/net/dsa/sja1105/sja1105_ptp.c +++ b/drivers/net/dsa/sja1105/sja1105_ptp.c @@ -234,7 +234,7 @@ int sja1105_ptp_commit(struct dsa_switch *ds, struct sja1105_ptp_cmd *cmd, if (rw == SPI_WRITE) priv->info->ptp_cmd_packing(buf, cmd, PACK); - rc = sja1105_xfer_buf(priv, SPI_WRITE, regs->ptp_control, buf, + rc = sja1105_xfer_buf(priv, rw, regs->ptp_control, buf, SJA1105_SIZE_PTP_CMD); if (rw == SPI_READ) From d00bdc0a8839de9a5c9be5af2a79dbf8e0087689 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 27 Dec 2019 03:03:54 +0200 Subject: [PATCH 112/118] net: dsa: sja1105: Remove restriction of zero base-time for taprio offload The check originates from the initial implementation which was not based on PTP time but on a standalone clock source. In the meantime we can now program the PTPSCHTM register at runtime with the dynamic base time (actually with a value that is 200 ns smaller, to avoid writing DELTA=0 in the Schedule Entry Points Parameters Table). And we also have logic for moving the actual base time in the future of the PHC's current time base, so the check for zero serves no purpose, since even if the user will specify zero, that's not what will end up in the static config table where the limitation is. Fixes: 86db36a347b4 ("net: dsa: sja1105: Implement state machine for TAS with PTP clock source") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/dsa/sja1105/sja1105_tas.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/net/dsa/sja1105/sja1105_tas.c b/drivers/net/dsa/sja1105/sja1105_tas.c index 26b925b5dace..fa6750d973d7 100644 --- a/drivers/net/dsa/sja1105/sja1105_tas.c +++ b/drivers/net/dsa/sja1105/sja1105_tas.c @@ -477,11 +477,6 @@ int sja1105_setup_tc_taprio(struct dsa_switch *ds, int port, if (admin->cycle_time_extension) return -ENOTSUPP; - if (!ns_to_sja1105_delta(admin->base_time)) { - dev_err(ds->dev, "A base time of zero is not hardware-allowed\n"); - return -ERANGE; - } - for (i = 0; i < admin->num_entries; i++) { s64 delta_ns = admin->entries[i].interval; s64 delta_cycles = ns_to_sja1105_delta(delta_ns); From 3a323ed7c9c6d60af05bada0efe7a8ef56cf317f Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 27 Dec 2019 03:08:07 +0200 Subject: [PATCH 113/118] Documentation: net: dsa: sja1105: Remove text about taprio base-time limitation Since commit 86db36a347b4 ("net: dsa: sja1105: Implement state machine for TAS with PTP clock source"), this paragraph is no longer true. So remove it. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- Documentation/networking/dsa/sja1105.rst | 6 ------ 1 file changed, 6 deletions(-) diff --git a/Documentation/networking/dsa/sja1105.rst b/Documentation/networking/dsa/sja1105.rst index eef20d0bcf7c..64553d8d91cb 100644 --- a/Documentation/networking/dsa/sja1105.rst +++ b/Documentation/networking/dsa/sja1105.rst @@ -230,12 +230,6 @@ simultaneously on two ports. The driver checks the consistency of the schedules against this restriction and errors out when appropriate. Schedule analysis is needed to avoid this, which is outside the scope of the document. -At the moment, the time-aware scheduler can only be triggered based on a -standalone clock and not based on PTP time. This means the base-time argument -from tc-taprio is ignored and the schedule starts right away. It also means it -is more difficult to phase-align the scheduler with the other devices in the -network. - Device Tree bindings and board design ===================================== From 54fa49ee88138756df0fcf867cb1849904710a8c Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 27 Dec 2019 03:11:13 +0200 Subject: [PATCH 114/118] net: dsa: sja1105: Reconcile the meaning of TPID and TPID2 for E/T and P/Q/R/S For first-generation switches (SJA1105E and SJA1105T): - TPID means C-Tag (typically 0x8100) - TPID2 means S-Tag (typically 0x88A8) While for the second generation switches (SJA1105P, SJA1105Q, SJA1105R, SJA1105S) it is the other way around: - TPID means S-Tag (typically 0x88A8) - TPID2 means C-Tag (typically 0x8100) In other words, E/T tags untagged traffic with TPID, and P/Q/R/S with TPID2. So the patch mentioned below fixed VLAN filtering for P/Q/R/S, but broke it for E/T. We strive for a common code path for all switches in the family, so just lie in the static config packing functions that TPID and TPID2 are at swapped bit offsets than they actually are, for P/Q/R/S. This will make both switches understand TPID to be ETH_P_8021Q and TPID2 to be ETH_P_8021AD. The meaning from the original E/T was chosen over P/Q/R/S because E/T is actually the one with public documentation available (UM10944.pdf). Fixes: f9a1a7646c0d ("net: dsa: sja1105: Reverse TPID and TPID2") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/dsa/sja1105/sja1105_main.c | 8 ++++---- drivers/net/dsa/sja1105/sja1105_static_config.c | 7 +++++-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index 86bbab166633..1da5ac111499 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -1569,8 +1569,8 @@ static int sja1105_vlan_filtering(struct dsa_switch *ds, int port, bool enabled) if (enabled) { /* Enable VLAN filtering. */ - tpid = ETH_P_8021AD; - tpid2 = ETH_P_8021Q; + tpid = ETH_P_8021Q; + tpid2 = ETH_P_8021AD; } else { /* Disable VLAN filtering. */ tpid = ETH_P_SJA1105; @@ -1579,9 +1579,9 @@ static int sja1105_vlan_filtering(struct dsa_switch *ds, int port, bool enabled) table = &priv->static_config.tables[BLK_IDX_GENERAL_PARAMS]; general_params = table->entries; - /* EtherType used to identify outer tagged (S-tag) VLAN traffic */ - general_params->tpid = tpid; /* EtherType used to identify inner tagged (C-tag) VLAN traffic */ + general_params->tpid = tpid; + /* EtherType used to identify outer tagged (S-tag) VLAN traffic */ general_params->tpid2 = tpid2; /* When VLAN filtering is on, we need to at least be able to * decode management traffic through the "backup plan". diff --git a/drivers/net/dsa/sja1105/sja1105_static_config.c b/drivers/net/dsa/sja1105/sja1105_static_config.c index 0d03e13e9909..63d2311817c4 100644 --- a/drivers/net/dsa/sja1105/sja1105_static_config.c +++ b/drivers/net/dsa/sja1105/sja1105_static_config.c @@ -142,6 +142,9 @@ static size_t sja1105et_general_params_entry_packing(void *buf, void *entry_ptr, return size; } +/* TPID and TPID2 are intentionally reversed so that semantic + * compatibility with E/T is kept. + */ static size_t sja1105pqrs_general_params_entry_packing(void *buf, void *entry_ptr, enum packing_op op) @@ -166,9 +169,9 @@ sja1105pqrs_general_params_entry_packing(void *buf, void *entry_ptr, sja1105_packing(buf, &entry->mirr_port, 141, 139, size, op); sja1105_packing(buf, &entry->vlmarker, 138, 107, size, op); sja1105_packing(buf, &entry->vlmask, 106, 75, size, op); - sja1105_packing(buf, &entry->tpid, 74, 59, size, op); + sja1105_packing(buf, &entry->tpid2, 74, 59, size, op); sja1105_packing(buf, &entry->ignore2stf, 58, 58, size, op); - sja1105_packing(buf, &entry->tpid2, 57, 42, size, op); + sja1105_packing(buf, &entry->tpid, 57, 42, size, op); sja1105_packing(buf, &entry->queue_ts, 41, 41, size, op); sja1105_packing(buf, &entry->egrmirrvid, 40, 29, size, op); sja1105_packing(buf, &entry->egrmirrpcp, 28, 26, size, op); From a33121e5487b424339636b25c35d3a180eaa5f5e Mon Sep 17 00:00:00 2001 From: Vladis Dronov Date: Fri, 27 Dec 2019 03:26:27 +0100 Subject: [PATCH 115/118] ptp: fix the race between the release of ptp_clock and cdev In a case when a ptp chardev (like /dev/ptp0) is open but an underlying device is removed, closing this file leads to a race. This reproduces easily in a kvm virtual machine: ts# cat openptp0.c int main() { ... fp = fopen("/dev/ptp0", "r"); ... sleep(10); } ts# uname -r 5.5.0-rc3-46cf053e ts# cat /proc/cmdline ... slub_debug=FZP ts# modprobe ptp_kvm ts# ./openptp0 & [1] 670 opened /dev/ptp0, sleeping 10s... ts# rmmod ptp_kvm ts# ls /dev/ptp* ls: cannot access '/dev/ptp*': No such file or directory ts# ...woken up [ 48.010809] general protection fault: 0000 [#1] SMP [ 48.012502] CPU: 6 PID: 658 Comm: openptp0 Not tainted 5.5.0-rc3-46cf053e #25 [ 48.014624] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), ... [ 48.016270] RIP: 0010:module_put.part.0+0x7/0x80 [ 48.017939] RSP: 0018:ffffb3850073be00 EFLAGS: 00010202 [ 48.018339] RAX: 000000006b6b6b6b RBX: 6b6b6b6b6b6b6b6b RCX: ffff89a476c00ad0 [ 48.018936] RDX: fffff65a08d3ea08 RSI: 0000000000000247 RDI: 6b6b6b6b6b6b6b6b [ 48.019470] ... ^^^ a slub poison [ 48.023854] Call Trace: [ 48.024050] __fput+0x21f/0x240 [ 48.024288] task_work_run+0x79/0x90 [ 48.024555] do_exit+0x2af/0xab0 [ 48.024799] ? vfs_write+0x16a/0x190 [ 48.025082] do_group_exit+0x35/0x90 [ 48.025387] __x64_sys_exit_group+0xf/0x10 [ 48.025737] do_syscall_64+0x3d/0x130 [ 48.026056] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 48.026479] RIP: 0033:0x7f53b12082f6 [ 48.026792] ... [ 48.030945] Modules linked in: ptp i6300esb watchdog [last unloaded: ptp_kvm] [ 48.045001] Fixing recursive fault but reboot is needed! This happens in: static void __fput(struct file *file) { ... if (file->f_op->release) file->f_op->release(inode, file); <<< cdev is kfree'd here if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL && !(mode & FMODE_PATH))) { cdev_put(inode->i_cdev); <<< cdev fields are accessed here Namely: __fput() posix_clock_release() kref_put(&clk->kref, delete_clock) <<< the last reference delete_clock() delete_ptp_clock() kfree(ptp) <<< cdev is embedded in ptp cdev_put module_put(p->owner) <<< *p is kfree'd, bang! Here cdev is embedded in posix_clock which is embedded in ptp_clock. The race happens because ptp_clock's lifetime is controlled by two refcounts: kref and cdev.kobj in posix_clock. This is wrong. Make ptp_clock's sysfs device a parent of cdev with cdev_device_add() created especially for such cases. This way the parent device with its ptp_clock is not released until all references to the cdev are released. This adds a requirement that an initialized but not exposed struct device should be provided to posix_clock_register() by a caller instead of a simple dev_t. This approach was adopted from the commit 72139dfa2464 ("watchdog: Fix the race between the release of watchdog_core_data and cdev"). See details of the implementation in the commit 233ed09d7fda ("chardev: add helper function to register char devs with a struct device"). Link: https://lore.kernel.org/linux-fsdevel/20191125125342.6189-1-vdronov@redhat.com/T/#u Analyzed-by: Stephen Johnston Analyzed-by: Vern Lovejoy Signed-off-by: Vladis Dronov Acked-by: Richard Cochran Signed-off-by: David S. Miller --- drivers/ptp/ptp_clock.c | 31 ++++++++++++++----------------- drivers/ptp/ptp_private.h | 2 +- include/linux/posix-clock.h | 19 +++++++++++-------- kernel/time/posix-clock.c | 31 +++++++++++++------------------ 4 files changed, 39 insertions(+), 44 deletions(-) diff --git a/drivers/ptp/ptp_clock.c b/drivers/ptp/ptp_clock.c index e60eab7f8a61..61fafe0374ce 100644 --- a/drivers/ptp/ptp_clock.c +++ b/drivers/ptp/ptp_clock.c @@ -166,9 +166,9 @@ static struct posix_clock_operations ptp_clock_ops = { .read = ptp_read, }; -static void delete_ptp_clock(struct posix_clock *pc) +static void ptp_clock_release(struct device *dev) { - struct ptp_clock *ptp = container_of(pc, struct ptp_clock, clock); + struct ptp_clock *ptp = container_of(dev, struct ptp_clock, dev); mutex_destroy(&ptp->tsevq_mux); mutex_destroy(&ptp->pincfg_mux); @@ -213,7 +213,6 @@ struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info, } ptp->clock.ops = ptp_clock_ops; - ptp->clock.release = delete_ptp_clock; ptp->info = info; ptp->devid = MKDEV(major, index); ptp->index = index; @@ -236,15 +235,6 @@ struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info, if (err) goto no_pin_groups; - /* Create a new device in our class. */ - ptp->dev = device_create_with_groups(ptp_class, parent, ptp->devid, - ptp, ptp->pin_attr_groups, - "ptp%d", ptp->index); - if (IS_ERR(ptp->dev)) { - err = PTR_ERR(ptp->dev); - goto no_device; - } - /* Register a new PPS source. */ if (info->pps) { struct pps_source_info pps; @@ -260,8 +250,18 @@ struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info, } } - /* Create a posix clock. */ - err = posix_clock_register(&ptp->clock, ptp->devid); + /* Initialize a new device of our class in our clock structure. */ + device_initialize(&ptp->dev); + ptp->dev.devt = ptp->devid; + ptp->dev.class = ptp_class; + ptp->dev.parent = parent; + ptp->dev.groups = ptp->pin_attr_groups; + ptp->dev.release = ptp_clock_release; + dev_set_drvdata(&ptp->dev, ptp); + dev_set_name(&ptp->dev, "ptp%d", ptp->index); + + /* Create a posix clock and link it to the device. */ + err = posix_clock_register(&ptp->clock, &ptp->dev); if (err) { pr_err("failed to create posix clock\n"); goto no_clock; @@ -273,8 +273,6 @@ struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info, if (ptp->pps_source) pps_unregister_source(ptp->pps_source); no_pps: - device_destroy(ptp_class, ptp->devid); -no_device: ptp_cleanup_pin_groups(ptp); no_pin_groups: if (ptp->kworker) @@ -304,7 +302,6 @@ int ptp_clock_unregister(struct ptp_clock *ptp) if (ptp->pps_source) pps_unregister_source(ptp->pps_source); - device_destroy(ptp_class, ptp->devid); ptp_cleanup_pin_groups(ptp); posix_clock_unregister(&ptp->clock); diff --git a/drivers/ptp/ptp_private.h b/drivers/ptp/ptp_private.h index 9171d42468fd..6b97155148f1 100644 --- a/drivers/ptp/ptp_private.h +++ b/drivers/ptp/ptp_private.h @@ -28,7 +28,7 @@ struct timestamp_event_queue { struct ptp_clock { struct posix_clock clock; - struct device *dev; + struct device dev; struct ptp_clock_info *info; dev_t devid; int index; /* index into clocks.map */ diff --git a/include/linux/posix-clock.h b/include/linux/posix-clock.h index fe6cfdcfbc26..468328b1e1dd 100644 --- a/include/linux/posix-clock.h +++ b/include/linux/posix-clock.h @@ -69,29 +69,32 @@ struct posix_clock_operations { * * @ops: Functional interface to the clock * @cdev: Character device instance for this clock - * @kref: Reference count. + * @dev: Pointer to the clock's device. * @rwsem: Protects the 'zombie' field from concurrent access. * @zombie: If 'zombie' is true, then the hardware has disappeared. - * @release: A function to free the structure when the reference count reaches - * zero. May be NULL if structure is statically allocated. * * Drivers should embed their struct posix_clock within a private * structure, obtaining a reference to it during callbacks using * container_of(). + * + * Drivers should supply an initialized but not exposed struct device + * to posix_clock_register(). It is used to manage lifetime of the + * driver's private structure. It's 'release' field should be set to + * a release function for this private structure. */ struct posix_clock { struct posix_clock_operations ops; struct cdev cdev; - struct kref kref; + struct device *dev; struct rw_semaphore rwsem; bool zombie; - void (*release)(struct posix_clock *clk); }; /** * posix_clock_register() - register a new clock - * @clk: Pointer to the clock. Caller must provide 'ops' and 'release' - * @devid: Allocated device id + * @clk: Pointer to the clock. Caller must provide 'ops' field + * @dev: Pointer to the initialized device. Caller must provide + * 'release' field * * A clock driver calls this function to register itself with the * clock device subsystem. If 'clk' points to dynamically allocated @@ -100,7 +103,7 @@ struct posix_clock { * * Returns zero on success, non-zero otherwise. */ -int posix_clock_register(struct posix_clock *clk, dev_t devid); +int posix_clock_register(struct posix_clock *clk, struct device *dev); /** * posix_clock_unregister() - unregister a clock diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index ec960bb939fd..200fb2d3be99 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -14,8 +14,6 @@ #include "posix-timers.h" -static void delete_clock(struct kref *kref); - /* * Returns NULL if the posix_clock instance attached to 'fp' is old and stale. */ @@ -125,7 +123,7 @@ static int posix_clock_open(struct inode *inode, struct file *fp) err = 0; if (!err) { - kref_get(&clk->kref); + get_device(clk->dev); fp->private_data = clk; } out: @@ -141,7 +139,7 @@ static int posix_clock_release(struct inode *inode, struct file *fp) if (clk->ops.release) err = clk->ops.release(clk); - kref_put(&clk->kref, delete_clock); + put_device(clk->dev); fp->private_data = NULL; @@ -161,38 +159,35 @@ static const struct file_operations posix_clock_file_operations = { #endif }; -int posix_clock_register(struct posix_clock *clk, dev_t devid) +int posix_clock_register(struct posix_clock *clk, struct device *dev) { int err; - kref_init(&clk->kref); init_rwsem(&clk->rwsem); cdev_init(&clk->cdev, &posix_clock_file_operations); + err = cdev_device_add(&clk->cdev, dev); + if (err) { + pr_err("%s unable to add device %d:%d\n", + dev_name(dev), MAJOR(dev->devt), MINOR(dev->devt)); + return err; + } clk->cdev.owner = clk->ops.owner; - err = cdev_add(&clk->cdev, devid, 1); + clk->dev = dev; - return err; + return 0; } EXPORT_SYMBOL_GPL(posix_clock_register); -static void delete_clock(struct kref *kref) -{ - struct posix_clock *clk = container_of(kref, struct posix_clock, kref); - - if (clk->release) - clk->release(clk); -} - void posix_clock_unregister(struct posix_clock *clk) { - cdev_del(&clk->cdev); + cdev_device_del(&clk->cdev, clk->dev); down_write(&clk->rwsem); clk->zombie = true; up_write(&clk->rwsem); - kref_put(&clk->kref, delete_clock); + put_device(clk->dev); } EXPORT_SYMBOL_GPL(posix_clock_unregister); From 853697504de043ff0bfd815bd3a64de1dce73dc7 Mon Sep 17 00:00:00 2001 From: Cambda Zhu Date: Fri, 27 Dec 2019 16:52:37 +0800 Subject: [PATCH 116/118] tcp: Fix highest_sack and highest_sack_seq >From commit 50895b9de1d3 ("tcp: highest_sack fix"), the logic about setting tp->highest_sack to the head of the send queue was removed. Of course the logic is error prone, but it is logical. Before we remove the pointer to the highest sack skb and use the seq instead, we need to set tp->highest_sack to NULL when there is no skb after the last sack, and then replace NULL with the real skb when new skb inserted into the rtx queue, because the NULL means the highest sack seq is tp->snd_nxt. If tp->highest_sack is NULL and new data sent, the next ACK with sack option will increase tp->reordering unexpectedly. This patch sets tp->highest_sack to the tail of the rtx queue if it's NULL and new data is sent. The patch keeps the rule that the highest_sack can only be maintained by sack processing, except for this only case. Fixes: 50895b9de1d3 ("tcp: highest_sack fix") Signed-off-by: Cambda Zhu Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 1f7735ca8f22..58c92a7d671c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -72,6 +72,9 @@ static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) __skb_unlink(skb, &sk->sk_write_queue); tcp_rbtree_insert(&sk->tcp_rtx_queue, skb); + if (tp->highest_sack == NULL) + tp->highest_sack = skb; + tp->packets_out += tcp_skb_pcount(skb); if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) tcp_rearm_rto(sk); From a5b72a083da197b493c7ed1e5730d62d3199f7d6 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Sat, 28 Dec 2019 16:36:58 +0100 Subject: [PATCH 117/118] net/sched: add delete_empty() to filters and use it in cls_flower Revert "net/sched: cls_u32: fix refcount leak in the error path of u32_change()", and fix the u32 refcount leak in a more generic way that preserves the semantic of rule dumping. On tc filters that don't support lockless insertion/removal, there is no need to guard against concurrent insertion when a removal is in progress. Therefore, for most of them we can avoid a full walk() when deleting, and just decrease the refcount, like it was done on older Linux kernels. This fixes situations where walk() was wrongly detecting a non-empty filter, like it happened with cls_u32 in the error path of change(), thus leading to failures in the following tdc selftests: 6aa7: (filter, u32) Add/Replace u32 with source match and invalid indev 6658: (filter, u32) Add/Replace u32 with custom hash table and invalid handle 74c2: (filter, u32) Add/Replace u32 filter with invalid hash table id On cls_flower, and on (future) lockless filters, this check is necessary: move all the check_empty() logic in a callback so that each filter can have its own implementation. For cls_flower, it's sufficient to check if no IDRs have been allocated. This reverts commit 275c44aa194b7159d1191817b20e076f55f0e620. Changes since v1: - document the need for delete_empty() when TCF_PROTO_OPS_DOIT_UNLOCKED is used, thanks to Vlad Buslov - implement delete_empty() without doing fl_walk(), thanks to Vlad Buslov - squash revert and new fix in a single patch, to be nice with bisect tests that run tdc on u32 filter, thanks to Dave Miller Fixes: 275c44aa194b ("net/sched: cls_u32: fix refcount leak in the error path of u32_change()") Fixes: 6676d5e416ee ("net: sched: set dedicated tcf_walker flag when tp is empty") Suggested-by: Jamal Hadi Salim Suggested-by: Vlad Buslov Signed-off-by: Davide Caratti Reviewed-by: Vlad Buslov Tested-by: Jamal Hadi Salim Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/net/sch_generic.h | 5 +++++ net/sched/cls_api.c | 31 +++++-------------------------- net/sched/cls_flower.c | 12 ++++++++++++ net/sched/cls_u32.c | 25 ------------------------- 4 files changed, 22 insertions(+), 51 deletions(-) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 144f264ea394..fceddf89592a 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -308,6 +308,7 @@ struct tcf_proto_ops { int (*delete)(struct tcf_proto *tp, void *arg, bool *last, bool rtnl_held, struct netlink_ext_ack *); + bool (*delete_empty)(struct tcf_proto *tp); void (*walk)(struct tcf_proto *tp, struct tcf_walker *arg, bool rtnl_held); int (*reoffload)(struct tcf_proto *tp, bool add, @@ -336,6 +337,10 @@ struct tcf_proto_ops { int flags; }; +/* Classifiers setting TCF_PROTO_OPS_DOIT_UNLOCKED in tcf_proto_ops->flags + * are expected to implement tcf_proto_ops->delete_empty(), otherwise race + * conditions can occur when filters are inserted/deleted simultaneously. + */ enum tcf_proto_ops_flags { TCF_PROTO_OPS_DOIT_UNLOCKED = 1, }; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 6a0eacafdb19..76e0d122616a 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -308,33 +308,12 @@ static void tcf_proto_put(struct tcf_proto *tp, bool rtnl_held, tcf_proto_destroy(tp, rtnl_held, true, extack); } -static int walker_check_empty(struct tcf_proto *tp, void *fh, - struct tcf_walker *arg) +static bool tcf_proto_check_delete(struct tcf_proto *tp) { - if (fh) { - arg->nonempty = true; - return -1; - } - return 0; -} + if (tp->ops->delete_empty) + return tp->ops->delete_empty(tp); -static bool tcf_proto_is_empty(struct tcf_proto *tp, bool rtnl_held) -{ - struct tcf_walker walker = { .fn = walker_check_empty, }; - - if (tp->ops->walk) { - tp->ops->walk(tp, &walker, rtnl_held); - return !walker.nonempty; - } - return true; -} - -static bool tcf_proto_check_delete(struct tcf_proto *tp, bool rtnl_held) -{ - spin_lock(&tp->lock); - if (tcf_proto_is_empty(tp, rtnl_held)) - tp->deleting = true; - spin_unlock(&tp->lock); + tp->deleting = true; return tp->deleting; } @@ -1751,7 +1730,7 @@ static void tcf_chain_tp_delete_empty(struct tcf_chain *chain, * concurrently. * Mark tp for deletion if it is empty. */ - if (!tp_iter || !tcf_proto_check_delete(tp, rtnl_held)) { + if (!tp_iter || !tcf_proto_check_delete(tp)) { mutex_unlock(&chain->filter_chain_lock); return; } diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 0d125de54285..b0f42e62dd76 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -2773,6 +2773,17 @@ static void fl_bind_class(void *fh, u32 classid, unsigned long cl) f->res.class = cl; } +static bool fl_delete_empty(struct tcf_proto *tp) +{ + struct cls_fl_head *head = fl_head_dereference(tp); + + spin_lock(&tp->lock); + tp->deleting = idr_is_empty(&head->handle_idr); + spin_unlock(&tp->lock); + + return tp->deleting; +} + static struct tcf_proto_ops cls_fl_ops __read_mostly = { .kind = "flower", .classify = fl_classify, @@ -2782,6 +2793,7 @@ static struct tcf_proto_ops cls_fl_ops __read_mostly = { .put = fl_put, .change = fl_change, .delete = fl_delete, + .delete_empty = fl_delete_empty, .walk = fl_walk, .reoffload = fl_reoffload, .hw_add = fl_hw_add, diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 66c6bcec16cb..a0e6fac613de 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -1108,33 +1108,10 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, return err; } -static bool u32_hnode_empty(struct tc_u_hnode *ht, bool *non_root_ht) -{ - int i; - - if (!ht) - return true; - if (!ht->is_root) { - *non_root_ht = true; - return false; - } - if (*non_root_ht) - return false; - if (ht->refcnt < 2) - return true; - - for (i = 0; i <= ht->divisor; i++) { - if (rtnl_dereference(ht->ht[i])) - return false; - } - return true; -} - static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg, bool rtnl_held) { struct tc_u_common *tp_c = tp->data; - bool non_root_ht = false; struct tc_u_hnode *ht; struct tc_u_knode *n; unsigned int h; @@ -1147,8 +1124,6 @@ static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg, ht = rtnl_dereference(ht->next)) { if (ht->prio != tp->prio) continue; - if (u32_hnode_empty(ht, &non_root_ht)) - return; if (arg->count >= arg->skip) { if (arg->fn(tp, ht, arg) < 0) { arg->stop = 1; From 04b69426d846cd04ca9acefff1ea39e1c64d2714 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sat, 28 Dec 2019 16:28:09 +0000 Subject: [PATCH 118/118] hsr: fix slab-out-of-bounds Read in hsr_debugfs_rename() hsr slave interfaces don't have debugfs directory. So, hsr_debugfs_rename() shouldn't be called when hsr slave interface name is changed. Test commands: ip link add dummy0 type dummy ip link add dummy1 type dummy ip link add hsr0 type hsr slave1 dummy0 slave2 dummy1 ip link set dummy0 name ap Splat looks like: [21071.899367][T22666] ap: renamed from dummy0 [21071.914005][T22666] ================================================================== [21071.919008][T22666] BUG: KASAN: slab-out-of-bounds in hsr_debugfs_rename+0xaa/0xb0 [hsr] [21071.923640][T22666] Read of size 8 at addr ffff88805febcd98 by task ip/22666 [21071.926941][T22666] [21071.927750][T22666] CPU: 0 PID: 22666 Comm: ip Not tainted 5.5.0-rc2+ #240 [21071.929919][T22666] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 [21071.935094][T22666] Call Trace: [21071.935867][T22666] dump_stack+0x96/0xdb [21071.936687][T22666] ? hsr_debugfs_rename+0xaa/0xb0 [hsr] [21071.937774][T22666] print_address_description.constprop.5+0x1be/0x360 [21071.939019][T22666] ? hsr_debugfs_rename+0xaa/0xb0 [hsr] [21071.940081][T22666] ? hsr_debugfs_rename+0xaa/0xb0 [hsr] [21071.940949][T22666] __kasan_report+0x12a/0x16f [21071.941758][T22666] ? hsr_debugfs_rename+0xaa/0xb0 [hsr] [21071.942674][T22666] kasan_report+0xe/0x20 [21071.943325][T22666] hsr_debugfs_rename+0xaa/0xb0 [hsr] [21071.944187][T22666] hsr_netdev_notify+0x1fe/0x9b0 [hsr] [21071.945052][T22666] ? __module_text_address+0x13/0x140 [21071.945897][T22666] notifier_call_chain+0x90/0x160 [21071.946743][T22666] dev_change_name+0x419/0x840 [21071.947496][T22666] ? __read_once_size_nocheck.constprop.6+0x10/0x10 [21071.948600][T22666] ? netdev_adjacent_rename_links+0x280/0x280 [21071.949577][T22666] ? __read_once_size_nocheck.constprop.6+0x10/0x10 [21071.950672][T22666] ? lock_downgrade+0x6e0/0x6e0 [21071.951345][T22666] ? do_setlink+0x811/0x2ef0 [21071.951991][T22666] do_setlink+0x811/0x2ef0 [21071.952613][T22666] ? is_bpf_text_address+0x81/0xe0 [ ... ] Reported-by: syzbot+9328206518f08318a5fd@syzkaller.appspotmail.com Fixes: 4c2d5e33dcd3 ("hsr: rename debugfs file when interface name is changed") Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- net/hsr/hsr_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c index d2ee7125a7f1..9e389accbfc7 100644 --- a/net/hsr/hsr_main.c +++ b/net/hsr/hsr_main.c @@ -46,7 +46,8 @@ static int hsr_netdev_notify(struct notifier_block *nb, unsigned long event, hsr_check_carrier_and_operstate(hsr); break; case NETDEV_CHANGENAME: - hsr_debugfs_rename(dev); + if (is_hsr_master(dev)) + hsr_debugfs_rename(dev); break; case NETDEV_CHANGEADDR: if (port->type == HSR_PT_MASTER) {