linux/drivers/infiniband/core/netlink.c

332 lines
8.6 KiB
C
Raw Normal View History

/*
* Copyright (c) 2017 Mellanox Technologies Inc. All rights reserved.
* Copyright (c) 2010 Voltaire Inc. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__
#include <linux/export.h>
#include <net/netlink.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/sock.h>
#include <rdma/rdma_netlink.h>
#include <linux/module.h>
#include "core_priv.h"
static struct {
IB/core: Avoid deadlock during netlink message handling When rdmacm module is not loaded, and when netlink message is received to get char device info, it results into a deadlock due to recursive locking of rdma_nl_mutex with the below call sequence. [..] rdma_nl_rcv() mutex_lock() [..] rdma_nl_rcv_msg() ib_get_client_nl_info() request_module() iw_cm_init() rdma_nl_register() mutex_lock(); <- Deadlock, acquiring mutex again Due to above call sequence, following call trace and deadlock is observed. kernel: __mutex_lock+0x35e/0x860 kernel: ? __mutex_lock+0x129/0x860 kernel: ? rdma_nl_register+0x1a/0x90 [ib_core] kernel: rdma_nl_register+0x1a/0x90 [ib_core] kernel: ? 0xffffffffc029b000 kernel: iw_cm_init+0x34/0x1000 [iw_cm] kernel: do_one_initcall+0x67/0x2d4 kernel: ? kmem_cache_alloc_trace+0x1ec/0x2a0 kernel: do_init_module+0x5a/0x223 kernel: load_module+0x1998/0x1e10 kernel: ? __symbol_put+0x60/0x60 kernel: __do_sys_finit_module+0x94/0xe0 kernel: do_syscall_64+0x5a/0x270 kernel: entry_SYSCALL_64_after_hwframe+0x49/0xbe process stack trace: [<0>] __request_module+0x1c9/0x460 [<0>] ib_get_client_nl_info+0x5e/0xb0 [ib_core] [<0>] nldev_get_chardev+0x1ac/0x320 [ib_core] [<0>] rdma_nl_rcv_msg+0xeb/0x1d0 [ib_core] [<0>] rdma_nl_rcv+0xcd/0x120 [ib_core] [<0>] netlink_unicast+0x179/0x220 [<0>] netlink_sendmsg+0x2f6/0x3f0 [<0>] sock_sendmsg+0x30/0x40 [<0>] ___sys_sendmsg+0x27a/0x290 [<0>] __sys_sendmsg+0x58/0xa0 [<0>] do_syscall_64+0x5a/0x270 [<0>] entry_SYSCALL_64_after_hwframe+0x49/0xbe To overcome this deadlock and to allow multiple netlink messages to progress in parallel, following scheme is implemented. 1. Split the lock protecting the cb_table into a per-index lock, and make it a rwlock. This lock is used to ensure no callbacks are running after unregistration returns. Since a module will not be registered once it is already running callbacks, this avoids the deadlock. 2. Use smp_store_release() to update the cb_table during registration so that no lock is required. This avoids lockdep problems with thinking all the rwsems are the same lock class. Fixes: 0e2d00eb6fd45 ("RDMA: Add NLDEV_GET_CHARDEV to allow char dev discovery and autoload") Link: https://lore.kernel.org/r/20191015080733.18625-1-leon@kernel.org Signed-off-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2019-10-15 16:07:33 +08:00
const struct rdma_nl_cbs *cb_table;
/* Synchronizes between ongoing netlink commands and netlink client
* unregistration.
*/
struct rw_semaphore sem;
} rdma_nl_types[RDMA_NL_NUM_CLIENTS];
bool rdma_nl_chk_listeners(unsigned int group)
{
struct rdma_dev_net *rnet = rdma_net_to_dev_net(&init_net);
return netlink_has_listeners(rnet->nl_sock, group);
}
EXPORT_SYMBOL(rdma_nl_chk_listeners);
static bool is_nl_msg_valid(unsigned int type, unsigned int op)
{
static const unsigned int max_num_ops[RDMA_NL_NUM_CLIENTS] = {
[RDMA_NL_IWCM] = RDMA_NL_IWPM_NUM_OPS,
[RDMA_NL_LS] = RDMA_NL_LS_NUM_OPS,
[RDMA_NL_NLDEV] = RDMA_NLDEV_NUM_OPS,
};
/*
* This BUILD_BUG_ON is intended to catch addition of new
* RDMA netlink protocol without updating the array above.
*/
BUILD_BUG_ON(RDMA_NL_NUM_CLIENTS != 6);
if (type >= RDMA_NL_NUM_CLIENTS)
return false;
return (op < max_num_ops[type]) ? true : false;
}
IB/core: Avoid deadlock during netlink message handling When rdmacm module is not loaded, and when netlink message is received to get char device info, it results into a deadlock due to recursive locking of rdma_nl_mutex with the below call sequence. [..] rdma_nl_rcv() mutex_lock() [..] rdma_nl_rcv_msg() ib_get_client_nl_info() request_module() iw_cm_init() rdma_nl_register() mutex_lock(); <- Deadlock, acquiring mutex again Due to above call sequence, following call trace and deadlock is observed. kernel: __mutex_lock+0x35e/0x860 kernel: ? __mutex_lock+0x129/0x860 kernel: ? rdma_nl_register+0x1a/0x90 [ib_core] kernel: rdma_nl_register+0x1a/0x90 [ib_core] kernel: ? 0xffffffffc029b000 kernel: iw_cm_init+0x34/0x1000 [iw_cm] kernel: do_one_initcall+0x67/0x2d4 kernel: ? kmem_cache_alloc_trace+0x1ec/0x2a0 kernel: do_init_module+0x5a/0x223 kernel: load_module+0x1998/0x1e10 kernel: ? __symbol_put+0x60/0x60 kernel: __do_sys_finit_module+0x94/0xe0 kernel: do_syscall_64+0x5a/0x270 kernel: entry_SYSCALL_64_after_hwframe+0x49/0xbe process stack trace: [<0>] __request_module+0x1c9/0x460 [<0>] ib_get_client_nl_info+0x5e/0xb0 [ib_core] [<0>] nldev_get_chardev+0x1ac/0x320 [ib_core] [<0>] rdma_nl_rcv_msg+0xeb/0x1d0 [ib_core] [<0>] rdma_nl_rcv+0xcd/0x120 [ib_core] [<0>] netlink_unicast+0x179/0x220 [<0>] netlink_sendmsg+0x2f6/0x3f0 [<0>] sock_sendmsg+0x30/0x40 [<0>] ___sys_sendmsg+0x27a/0x290 [<0>] __sys_sendmsg+0x58/0xa0 [<0>] do_syscall_64+0x5a/0x270 [<0>] entry_SYSCALL_64_after_hwframe+0x49/0xbe To overcome this deadlock and to allow multiple netlink messages to progress in parallel, following scheme is implemented. 1. Split the lock protecting the cb_table into a per-index lock, and make it a rwlock. This lock is used to ensure no callbacks are running after unregistration returns. Since a module will not be registered once it is already running callbacks, this avoids the deadlock. 2. Use smp_store_release() to update the cb_table during registration so that no lock is required. This avoids lockdep problems with thinking all the rwsems are the same lock class. Fixes: 0e2d00eb6fd45 ("RDMA: Add NLDEV_GET_CHARDEV to allow char dev discovery and autoload") Link: https://lore.kernel.org/r/20191015080733.18625-1-leon@kernel.org Signed-off-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2019-10-15 16:07:33 +08:00
static const struct rdma_nl_cbs *
get_cb_table(const struct sk_buff *skb, unsigned int type, unsigned int op)
{
const struct rdma_nl_cbs *cb_table;
/*
* Currently only NLDEV client is supporting netlink commands in
* non init_net net namespace.
*/
if (sock_net(skb->sk) != &init_net && type != RDMA_NL_NLDEV)
IB/core: Avoid deadlock during netlink message handling When rdmacm module is not loaded, and when netlink message is received to get char device info, it results into a deadlock due to recursive locking of rdma_nl_mutex with the below call sequence. [..] rdma_nl_rcv() mutex_lock() [..] rdma_nl_rcv_msg() ib_get_client_nl_info() request_module() iw_cm_init() rdma_nl_register() mutex_lock(); <- Deadlock, acquiring mutex again Due to above call sequence, following call trace and deadlock is observed. kernel: __mutex_lock+0x35e/0x860 kernel: ? __mutex_lock+0x129/0x860 kernel: ? rdma_nl_register+0x1a/0x90 [ib_core] kernel: rdma_nl_register+0x1a/0x90 [ib_core] kernel: ? 0xffffffffc029b000 kernel: iw_cm_init+0x34/0x1000 [iw_cm] kernel: do_one_initcall+0x67/0x2d4 kernel: ? kmem_cache_alloc_trace+0x1ec/0x2a0 kernel: do_init_module+0x5a/0x223 kernel: load_module+0x1998/0x1e10 kernel: ? __symbol_put+0x60/0x60 kernel: __do_sys_finit_module+0x94/0xe0 kernel: do_syscall_64+0x5a/0x270 kernel: entry_SYSCALL_64_after_hwframe+0x49/0xbe process stack trace: [<0>] __request_module+0x1c9/0x460 [<0>] ib_get_client_nl_info+0x5e/0xb0 [ib_core] [<0>] nldev_get_chardev+0x1ac/0x320 [ib_core] [<0>] rdma_nl_rcv_msg+0xeb/0x1d0 [ib_core] [<0>] rdma_nl_rcv+0xcd/0x120 [ib_core] [<0>] netlink_unicast+0x179/0x220 [<0>] netlink_sendmsg+0x2f6/0x3f0 [<0>] sock_sendmsg+0x30/0x40 [<0>] ___sys_sendmsg+0x27a/0x290 [<0>] __sys_sendmsg+0x58/0xa0 [<0>] do_syscall_64+0x5a/0x270 [<0>] entry_SYSCALL_64_after_hwframe+0x49/0xbe To overcome this deadlock and to allow multiple netlink messages to progress in parallel, following scheme is implemented. 1. Split the lock protecting the cb_table into a per-index lock, and make it a rwlock. This lock is used to ensure no callbacks are running after unregistration returns. Since a module will not be registered once it is already running callbacks, this avoids the deadlock. 2. Use smp_store_release() to update the cb_table during registration so that no lock is required. This avoids lockdep problems with thinking all the rwsems are the same lock class. Fixes: 0e2d00eb6fd45 ("RDMA: Add NLDEV_GET_CHARDEV to allow char dev discovery and autoload") Link: https://lore.kernel.org/r/20191015080733.18625-1-leon@kernel.org Signed-off-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2019-10-15 16:07:33 +08:00
return NULL;
IB/core: Avoid deadlock during netlink message handling When rdmacm module is not loaded, and when netlink message is received to get char device info, it results into a deadlock due to recursive locking of rdma_nl_mutex with the below call sequence. [..] rdma_nl_rcv() mutex_lock() [..] rdma_nl_rcv_msg() ib_get_client_nl_info() request_module() iw_cm_init() rdma_nl_register() mutex_lock(); <- Deadlock, acquiring mutex again Due to above call sequence, following call trace and deadlock is observed. kernel: __mutex_lock+0x35e/0x860 kernel: ? __mutex_lock+0x129/0x860 kernel: ? rdma_nl_register+0x1a/0x90 [ib_core] kernel: rdma_nl_register+0x1a/0x90 [ib_core] kernel: ? 0xffffffffc029b000 kernel: iw_cm_init+0x34/0x1000 [iw_cm] kernel: do_one_initcall+0x67/0x2d4 kernel: ? kmem_cache_alloc_trace+0x1ec/0x2a0 kernel: do_init_module+0x5a/0x223 kernel: load_module+0x1998/0x1e10 kernel: ? __symbol_put+0x60/0x60 kernel: __do_sys_finit_module+0x94/0xe0 kernel: do_syscall_64+0x5a/0x270 kernel: entry_SYSCALL_64_after_hwframe+0x49/0xbe process stack trace: [<0>] __request_module+0x1c9/0x460 [<0>] ib_get_client_nl_info+0x5e/0xb0 [ib_core] [<0>] nldev_get_chardev+0x1ac/0x320 [ib_core] [<0>] rdma_nl_rcv_msg+0xeb/0x1d0 [ib_core] [<0>] rdma_nl_rcv+0xcd/0x120 [ib_core] [<0>] netlink_unicast+0x179/0x220 [<0>] netlink_sendmsg+0x2f6/0x3f0 [<0>] sock_sendmsg+0x30/0x40 [<0>] ___sys_sendmsg+0x27a/0x290 [<0>] __sys_sendmsg+0x58/0xa0 [<0>] do_syscall_64+0x5a/0x270 [<0>] entry_SYSCALL_64_after_hwframe+0x49/0xbe To overcome this deadlock and to allow multiple netlink messages to progress in parallel, following scheme is implemented. 1. Split the lock protecting the cb_table into a per-index lock, and make it a rwlock. This lock is used to ensure no callbacks are running after unregistration returns. Since a module will not be registered once it is already running callbacks, this avoids the deadlock. 2. Use smp_store_release() to update the cb_table during registration so that no lock is required. This avoids lockdep problems with thinking all the rwsems are the same lock class. Fixes: 0e2d00eb6fd45 ("RDMA: Add NLDEV_GET_CHARDEV to allow char dev discovery and autoload") Link: https://lore.kernel.org/r/20191015080733.18625-1-leon@kernel.org Signed-off-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2019-10-15 16:07:33 +08:00
cb_table = READ_ONCE(rdma_nl_types[type].cb_table);
if (!cb_table) {
/*
* Didn't get valid reference of the table, attempt module
* load once.
*/
up_read(&rdma_nl_types[type].sem);
IB/core: Avoid deadlock during netlink message handling When rdmacm module is not loaded, and when netlink message is received to get char device info, it results into a deadlock due to recursive locking of rdma_nl_mutex with the below call sequence. [..] rdma_nl_rcv() mutex_lock() [..] rdma_nl_rcv_msg() ib_get_client_nl_info() request_module() iw_cm_init() rdma_nl_register() mutex_lock(); <- Deadlock, acquiring mutex again Due to above call sequence, following call trace and deadlock is observed. kernel: __mutex_lock+0x35e/0x860 kernel: ? __mutex_lock+0x129/0x860 kernel: ? rdma_nl_register+0x1a/0x90 [ib_core] kernel: rdma_nl_register+0x1a/0x90 [ib_core] kernel: ? 0xffffffffc029b000 kernel: iw_cm_init+0x34/0x1000 [iw_cm] kernel: do_one_initcall+0x67/0x2d4 kernel: ? kmem_cache_alloc_trace+0x1ec/0x2a0 kernel: do_init_module+0x5a/0x223 kernel: load_module+0x1998/0x1e10 kernel: ? __symbol_put+0x60/0x60 kernel: __do_sys_finit_module+0x94/0xe0 kernel: do_syscall_64+0x5a/0x270 kernel: entry_SYSCALL_64_after_hwframe+0x49/0xbe process stack trace: [<0>] __request_module+0x1c9/0x460 [<0>] ib_get_client_nl_info+0x5e/0xb0 [ib_core] [<0>] nldev_get_chardev+0x1ac/0x320 [ib_core] [<0>] rdma_nl_rcv_msg+0xeb/0x1d0 [ib_core] [<0>] rdma_nl_rcv+0xcd/0x120 [ib_core] [<0>] netlink_unicast+0x179/0x220 [<0>] netlink_sendmsg+0x2f6/0x3f0 [<0>] sock_sendmsg+0x30/0x40 [<0>] ___sys_sendmsg+0x27a/0x290 [<0>] __sys_sendmsg+0x58/0xa0 [<0>] do_syscall_64+0x5a/0x270 [<0>] entry_SYSCALL_64_after_hwframe+0x49/0xbe To overcome this deadlock and to allow multiple netlink messages to progress in parallel, following scheme is implemented. 1. Split the lock protecting the cb_table into a per-index lock, and make it a rwlock. This lock is used to ensure no callbacks are running after unregistration returns. Since a module will not be registered once it is already running callbacks, this avoids the deadlock. 2. Use smp_store_release() to update the cb_table during registration so that no lock is required. This avoids lockdep problems with thinking all the rwsems are the same lock class. Fixes: 0e2d00eb6fd45 ("RDMA: Add NLDEV_GET_CHARDEV to allow char dev discovery and autoload") Link: https://lore.kernel.org/r/20191015080733.18625-1-leon@kernel.org Signed-off-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2019-10-15 16:07:33 +08:00
request_module("rdma-netlink-subsys-%d", type);
IB/core: Avoid deadlock during netlink message handling When rdmacm module is not loaded, and when netlink message is received to get char device info, it results into a deadlock due to recursive locking of rdma_nl_mutex with the below call sequence. [..] rdma_nl_rcv() mutex_lock() [..] rdma_nl_rcv_msg() ib_get_client_nl_info() request_module() iw_cm_init() rdma_nl_register() mutex_lock(); <- Deadlock, acquiring mutex again Due to above call sequence, following call trace and deadlock is observed. kernel: __mutex_lock+0x35e/0x860 kernel: ? __mutex_lock+0x129/0x860 kernel: ? rdma_nl_register+0x1a/0x90 [ib_core] kernel: rdma_nl_register+0x1a/0x90 [ib_core] kernel: ? 0xffffffffc029b000 kernel: iw_cm_init+0x34/0x1000 [iw_cm] kernel: do_one_initcall+0x67/0x2d4 kernel: ? kmem_cache_alloc_trace+0x1ec/0x2a0 kernel: do_init_module+0x5a/0x223 kernel: load_module+0x1998/0x1e10 kernel: ? __symbol_put+0x60/0x60 kernel: __do_sys_finit_module+0x94/0xe0 kernel: do_syscall_64+0x5a/0x270 kernel: entry_SYSCALL_64_after_hwframe+0x49/0xbe process stack trace: [<0>] __request_module+0x1c9/0x460 [<0>] ib_get_client_nl_info+0x5e/0xb0 [ib_core] [<0>] nldev_get_chardev+0x1ac/0x320 [ib_core] [<0>] rdma_nl_rcv_msg+0xeb/0x1d0 [ib_core] [<0>] rdma_nl_rcv+0xcd/0x120 [ib_core] [<0>] netlink_unicast+0x179/0x220 [<0>] netlink_sendmsg+0x2f6/0x3f0 [<0>] sock_sendmsg+0x30/0x40 [<0>] ___sys_sendmsg+0x27a/0x290 [<0>] __sys_sendmsg+0x58/0xa0 [<0>] do_syscall_64+0x5a/0x270 [<0>] entry_SYSCALL_64_after_hwframe+0x49/0xbe To overcome this deadlock and to allow multiple netlink messages to progress in parallel, following scheme is implemented. 1. Split the lock protecting the cb_table into a per-index lock, and make it a rwlock. This lock is used to ensure no callbacks are running after unregistration returns. Since a module will not be registered once it is already running callbacks, this avoids the deadlock. 2. Use smp_store_release() to update the cb_table during registration so that no lock is required. This avoids lockdep problems with thinking all the rwsems are the same lock class. Fixes: 0e2d00eb6fd45 ("RDMA: Add NLDEV_GET_CHARDEV to allow char dev discovery and autoload") Link: https://lore.kernel.org/r/20191015080733.18625-1-leon@kernel.org Signed-off-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2019-10-15 16:07:33 +08:00
down_read(&rdma_nl_types[type].sem);
cb_table = READ_ONCE(rdma_nl_types[type].cb_table);
}
if (!cb_table || (!cb_table[op].dump && !cb_table[op].doit))
IB/core: Avoid deadlock during netlink message handling When rdmacm module is not loaded, and when netlink message is received to get char device info, it results into a deadlock due to recursive locking of rdma_nl_mutex with the below call sequence. [..] rdma_nl_rcv() mutex_lock() [..] rdma_nl_rcv_msg() ib_get_client_nl_info() request_module() iw_cm_init() rdma_nl_register() mutex_lock(); <- Deadlock, acquiring mutex again Due to above call sequence, following call trace and deadlock is observed. kernel: __mutex_lock+0x35e/0x860 kernel: ? __mutex_lock+0x129/0x860 kernel: ? rdma_nl_register+0x1a/0x90 [ib_core] kernel: rdma_nl_register+0x1a/0x90 [ib_core] kernel: ? 0xffffffffc029b000 kernel: iw_cm_init+0x34/0x1000 [iw_cm] kernel: do_one_initcall+0x67/0x2d4 kernel: ? kmem_cache_alloc_trace+0x1ec/0x2a0 kernel: do_init_module+0x5a/0x223 kernel: load_module+0x1998/0x1e10 kernel: ? __symbol_put+0x60/0x60 kernel: __do_sys_finit_module+0x94/0xe0 kernel: do_syscall_64+0x5a/0x270 kernel: entry_SYSCALL_64_after_hwframe+0x49/0xbe process stack trace: [<0>] __request_module+0x1c9/0x460 [<0>] ib_get_client_nl_info+0x5e/0xb0 [ib_core] [<0>] nldev_get_chardev+0x1ac/0x320 [ib_core] [<0>] rdma_nl_rcv_msg+0xeb/0x1d0 [ib_core] [<0>] rdma_nl_rcv+0xcd/0x120 [ib_core] [<0>] netlink_unicast+0x179/0x220 [<0>] netlink_sendmsg+0x2f6/0x3f0 [<0>] sock_sendmsg+0x30/0x40 [<0>] ___sys_sendmsg+0x27a/0x290 [<0>] __sys_sendmsg+0x58/0xa0 [<0>] do_syscall_64+0x5a/0x270 [<0>] entry_SYSCALL_64_after_hwframe+0x49/0xbe To overcome this deadlock and to allow multiple netlink messages to progress in parallel, following scheme is implemented. 1. Split the lock protecting the cb_table into a per-index lock, and make it a rwlock. This lock is used to ensure no callbacks are running after unregistration returns. Since a module will not be registered once it is already running callbacks, this avoids the deadlock. 2. Use smp_store_release() to update the cb_table during registration so that no lock is required. This avoids lockdep problems with thinking all the rwsems are the same lock class. Fixes: 0e2d00eb6fd45 ("RDMA: Add NLDEV_GET_CHARDEV to allow char dev discovery and autoload") Link: https://lore.kernel.org/r/20191015080733.18625-1-leon@kernel.org Signed-off-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2019-10-15 16:07:33 +08:00
return NULL;
return cb_table;
}
void rdma_nl_register(unsigned int index,
const struct rdma_nl_cbs cb_table[])
{
IB/core: Avoid deadlock during netlink message handling When rdmacm module is not loaded, and when netlink message is received to get char device info, it results into a deadlock due to recursive locking of rdma_nl_mutex with the below call sequence. [..] rdma_nl_rcv() mutex_lock() [..] rdma_nl_rcv_msg() ib_get_client_nl_info() request_module() iw_cm_init() rdma_nl_register() mutex_lock(); <- Deadlock, acquiring mutex again Due to above call sequence, following call trace and deadlock is observed. kernel: __mutex_lock+0x35e/0x860 kernel: ? __mutex_lock+0x129/0x860 kernel: ? rdma_nl_register+0x1a/0x90 [ib_core] kernel: rdma_nl_register+0x1a/0x90 [ib_core] kernel: ? 0xffffffffc029b000 kernel: iw_cm_init+0x34/0x1000 [iw_cm] kernel: do_one_initcall+0x67/0x2d4 kernel: ? kmem_cache_alloc_trace+0x1ec/0x2a0 kernel: do_init_module+0x5a/0x223 kernel: load_module+0x1998/0x1e10 kernel: ? __symbol_put+0x60/0x60 kernel: __do_sys_finit_module+0x94/0xe0 kernel: do_syscall_64+0x5a/0x270 kernel: entry_SYSCALL_64_after_hwframe+0x49/0xbe process stack trace: [<0>] __request_module+0x1c9/0x460 [<0>] ib_get_client_nl_info+0x5e/0xb0 [ib_core] [<0>] nldev_get_chardev+0x1ac/0x320 [ib_core] [<0>] rdma_nl_rcv_msg+0xeb/0x1d0 [ib_core] [<0>] rdma_nl_rcv+0xcd/0x120 [ib_core] [<0>] netlink_unicast+0x179/0x220 [<0>] netlink_sendmsg+0x2f6/0x3f0 [<0>] sock_sendmsg+0x30/0x40 [<0>] ___sys_sendmsg+0x27a/0x290 [<0>] __sys_sendmsg+0x58/0xa0 [<0>] do_syscall_64+0x5a/0x270 [<0>] entry_SYSCALL_64_after_hwframe+0x49/0xbe To overcome this deadlock and to allow multiple netlink messages to progress in parallel, following scheme is implemented. 1. Split the lock protecting the cb_table into a per-index lock, and make it a rwlock. This lock is used to ensure no callbacks are running after unregistration returns. Since a module will not be registered once it is already running callbacks, this avoids the deadlock. 2. Use smp_store_release() to update the cb_table during registration so that no lock is required. This avoids lockdep problems with thinking all the rwsems are the same lock class. Fixes: 0e2d00eb6fd45 ("RDMA: Add NLDEV_GET_CHARDEV to allow char dev discovery and autoload") Link: https://lore.kernel.org/r/20191015080733.18625-1-leon@kernel.org Signed-off-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2019-10-15 16:07:33 +08:00
if (WARN_ON(!is_nl_msg_valid(index, 0)) ||
WARN_ON(READ_ONCE(rdma_nl_types[index].cb_table)))
return;
IB/core: Avoid deadlock during netlink message handling When rdmacm module is not loaded, and when netlink message is received to get char device info, it results into a deadlock due to recursive locking of rdma_nl_mutex with the below call sequence. [..] rdma_nl_rcv() mutex_lock() [..] rdma_nl_rcv_msg() ib_get_client_nl_info() request_module() iw_cm_init() rdma_nl_register() mutex_lock(); <- Deadlock, acquiring mutex again Due to above call sequence, following call trace and deadlock is observed. kernel: __mutex_lock+0x35e/0x860 kernel: ? __mutex_lock+0x129/0x860 kernel: ? rdma_nl_register+0x1a/0x90 [ib_core] kernel: rdma_nl_register+0x1a/0x90 [ib_core] kernel: ? 0xffffffffc029b000 kernel: iw_cm_init+0x34/0x1000 [iw_cm] kernel: do_one_initcall+0x67/0x2d4 kernel: ? kmem_cache_alloc_trace+0x1ec/0x2a0 kernel: do_init_module+0x5a/0x223 kernel: load_module+0x1998/0x1e10 kernel: ? __symbol_put+0x60/0x60 kernel: __do_sys_finit_module+0x94/0xe0 kernel: do_syscall_64+0x5a/0x270 kernel: entry_SYSCALL_64_after_hwframe+0x49/0xbe process stack trace: [<0>] __request_module+0x1c9/0x460 [<0>] ib_get_client_nl_info+0x5e/0xb0 [ib_core] [<0>] nldev_get_chardev+0x1ac/0x320 [ib_core] [<0>] rdma_nl_rcv_msg+0xeb/0x1d0 [ib_core] [<0>] rdma_nl_rcv+0xcd/0x120 [ib_core] [<0>] netlink_unicast+0x179/0x220 [<0>] netlink_sendmsg+0x2f6/0x3f0 [<0>] sock_sendmsg+0x30/0x40 [<0>] ___sys_sendmsg+0x27a/0x290 [<0>] __sys_sendmsg+0x58/0xa0 [<0>] do_syscall_64+0x5a/0x270 [<0>] entry_SYSCALL_64_after_hwframe+0x49/0xbe To overcome this deadlock and to allow multiple netlink messages to progress in parallel, following scheme is implemented. 1. Split the lock protecting the cb_table into a per-index lock, and make it a rwlock. This lock is used to ensure no callbacks are running after unregistration returns. Since a module will not be registered once it is already running callbacks, this avoids the deadlock. 2. Use smp_store_release() to update the cb_table during registration so that no lock is required. This avoids lockdep problems with thinking all the rwsems are the same lock class. Fixes: 0e2d00eb6fd45 ("RDMA: Add NLDEV_GET_CHARDEV to allow char dev discovery and autoload") Link: https://lore.kernel.org/r/20191015080733.18625-1-leon@kernel.org Signed-off-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2019-10-15 16:07:33 +08:00
/* Pairs with the READ_ONCE in is_nl_valid() */
smp_store_release(&rdma_nl_types[index].cb_table, cb_table);
}
EXPORT_SYMBOL(rdma_nl_register);
void rdma_nl_unregister(unsigned int index)
{
IB/core: Avoid deadlock during netlink message handling When rdmacm module is not loaded, and when netlink message is received to get char device info, it results into a deadlock due to recursive locking of rdma_nl_mutex with the below call sequence. [..] rdma_nl_rcv() mutex_lock() [..] rdma_nl_rcv_msg() ib_get_client_nl_info() request_module() iw_cm_init() rdma_nl_register() mutex_lock(); <- Deadlock, acquiring mutex again Due to above call sequence, following call trace and deadlock is observed. kernel: __mutex_lock+0x35e/0x860 kernel: ? __mutex_lock+0x129/0x860 kernel: ? rdma_nl_register+0x1a/0x90 [ib_core] kernel: rdma_nl_register+0x1a/0x90 [ib_core] kernel: ? 0xffffffffc029b000 kernel: iw_cm_init+0x34/0x1000 [iw_cm] kernel: do_one_initcall+0x67/0x2d4 kernel: ? kmem_cache_alloc_trace+0x1ec/0x2a0 kernel: do_init_module+0x5a/0x223 kernel: load_module+0x1998/0x1e10 kernel: ? __symbol_put+0x60/0x60 kernel: __do_sys_finit_module+0x94/0xe0 kernel: do_syscall_64+0x5a/0x270 kernel: entry_SYSCALL_64_after_hwframe+0x49/0xbe process stack trace: [<0>] __request_module+0x1c9/0x460 [<0>] ib_get_client_nl_info+0x5e/0xb0 [ib_core] [<0>] nldev_get_chardev+0x1ac/0x320 [ib_core] [<0>] rdma_nl_rcv_msg+0xeb/0x1d0 [ib_core] [<0>] rdma_nl_rcv+0xcd/0x120 [ib_core] [<0>] netlink_unicast+0x179/0x220 [<0>] netlink_sendmsg+0x2f6/0x3f0 [<0>] sock_sendmsg+0x30/0x40 [<0>] ___sys_sendmsg+0x27a/0x290 [<0>] __sys_sendmsg+0x58/0xa0 [<0>] do_syscall_64+0x5a/0x270 [<0>] entry_SYSCALL_64_after_hwframe+0x49/0xbe To overcome this deadlock and to allow multiple netlink messages to progress in parallel, following scheme is implemented. 1. Split the lock protecting the cb_table into a per-index lock, and make it a rwlock. This lock is used to ensure no callbacks are running after unregistration returns. Since a module will not be registered once it is already running callbacks, this avoids the deadlock. 2. Use smp_store_release() to update the cb_table during registration so that no lock is required. This avoids lockdep problems with thinking all the rwsems are the same lock class. Fixes: 0e2d00eb6fd45 ("RDMA: Add NLDEV_GET_CHARDEV to allow char dev discovery and autoload") Link: https://lore.kernel.org/r/20191015080733.18625-1-leon@kernel.org Signed-off-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2019-10-15 16:07:33 +08:00
down_write(&rdma_nl_types[index].sem);
rdma_nl_types[index].cb_table = NULL;
IB/core: Avoid deadlock during netlink message handling When rdmacm module is not loaded, and when netlink message is received to get char device info, it results into a deadlock due to recursive locking of rdma_nl_mutex with the below call sequence. [..] rdma_nl_rcv() mutex_lock() [..] rdma_nl_rcv_msg() ib_get_client_nl_info() request_module() iw_cm_init() rdma_nl_register() mutex_lock(); <- Deadlock, acquiring mutex again Due to above call sequence, following call trace and deadlock is observed. kernel: __mutex_lock+0x35e/0x860 kernel: ? __mutex_lock+0x129/0x860 kernel: ? rdma_nl_register+0x1a/0x90 [ib_core] kernel: rdma_nl_register+0x1a/0x90 [ib_core] kernel: ? 0xffffffffc029b000 kernel: iw_cm_init+0x34/0x1000 [iw_cm] kernel: do_one_initcall+0x67/0x2d4 kernel: ? kmem_cache_alloc_trace+0x1ec/0x2a0 kernel: do_init_module+0x5a/0x223 kernel: load_module+0x1998/0x1e10 kernel: ? __symbol_put+0x60/0x60 kernel: __do_sys_finit_module+0x94/0xe0 kernel: do_syscall_64+0x5a/0x270 kernel: entry_SYSCALL_64_after_hwframe+0x49/0xbe process stack trace: [<0>] __request_module+0x1c9/0x460 [<0>] ib_get_client_nl_info+0x5e/0xb0 [ib_core] [<0>] nldev_get_chardev+0x1ac/0x320 [ib_core] [<0>] rdma_nl_rcv_msg+0xeb/0x1d0 [ib_core] [<0>] rdma_nl_rcv+0xcd/0x120 [ib_core] [<0>] netlink_unicast+0x179/0x220 [<0>] netlink_sendmsg+0x2f6/0x3f0 [<0>] sock_sendmsg+0x30/0x40 [<0>] ___sys_sendmsg+0x27a/0x290 [<0>] __sys_sendmsg+0x58/0xa0 [<0>] do_syscall_64+0x5a/0x270 [<0>] entry_SYSCALL_64_after_hwframe+0x49/0xbe To overcome this deadlock and to allow multiple netlink messages to progress in parallel, following scheme is implemented. 1. Split the lock protecting the cb_table into a per-index lock, and make it a rwlock. This lock is used to ensure no callbacks are running after unregistration returns. Since a module will not be registered once it is already running callbacks, this avoids the deadlock. 2. Use smp_store_release() to update the cb_table during registration so that no lock is required. This avoids lockdep problems with thinking all the rwsems are the same lock class. Fixes: 0e2d00eb6fd45 ("RDMA: Add NLDEV_GET_CHARDEV to allow char dev discovery and autoload") Link: https://lore.kernel.org/r/20191015080733.18625-1-leon@kernel.org Signed-off-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2019-10-15 16:07:33 +08:00
up_write(&rdma_nl_types[index].sem);
}
EXPORT_SYMBOL(rdma_nl_unregister);
void *ibnl_put_msg(struct sk_buff *skb, struct nlmsghdr **nlh, int seq,
RDMA/core: Add support for iWARP Port Mapper user space service This patch adds iWARP Port Mapper (IWPM) Version 2 support. The iWARP Port Mapper implementation is based on the port mapper specification section in the Sockets Direct Protocol paper - http://www.rdmaconsortium.org/home/draft-pinkerton-iwarp-sdp-v1.0.pdf Existing iWARP RDMA providers use the same IP address as the native TCP/IP stack when creating RDMA connections. They need a mechanism to claim the TCP ports used for RDMA connections to prevent TCP port collisions when other host applications use TCP ports. The iWARP Port Mapper provides a standard mechanism to accomplish this. Without this service it is possible for RDMA application to bind/listen on the same port which is already being used by native TCP host application. If that happens the incoming TCP connection data can be passed to the RDMA stack with error. The iWARP Port Mapper solution doesn't contain any changes to the existing network stack in the kernel space. All the changes are contained with the infiniband tree and also in user space. The iWARP Port Mapper service is implemented as a user space daemon process. Source for the IWPM service is located at http://git.openfabrics.org/git?p=~tnikolova/libiwpm-1.0.0/.git;a=summary The iWARP driver (port mapper client) sends to the IWPM service the local IP address and TCP port it has received from the RDMA application, when starting a connection. The IWPM service performs a socket bind from user space to get an available TCP port, called a mapped port, and communicates it back to the client. In that sense, the IWPM service is used to map the TCP port, which the RDMA application uses to any port available from the host TCP port space. The mapped ports are used in iWARP RDMA connections to avoid collisions with native TCP stack which is aware that these ports are taken. When an RDMA connection using a mapped port is terminated, the client notifies the IWPM service, which then releases the TCP port. The message exchange between the IWPM service and the iWARP drivers (between user space and kernel space) is implemented using netlink sockets. 1) Netlink interface functions are added: ibnl_unicast() and ibnl_mulitcast() for sending netlink messages to user space 2) The signature of the existing ibnl_put_msg() is changed to be more generic 3) Two netlink clients are added: RDMA_NL_NES, RDMA_NL_C4IW corresponding to the two iWarp drivers - nes and cxgb4 which use the IWPM service 4) Enums are added to enumerate the attributes in the netlink messages, which are exchanged between the user space IWPM service and the iWARP drivers Signed-off-by: Tatyana Nikolova <tatyana.e.nikolova@intel.com> Signed-off-by: Steve Wise <swise@opengridcomputing.com> Reviewed-by: PJ Waskiewicz <pj.waskiewicz@solidfire.com> [ Fold in range checking fixes and nlh_next removal as suggested by Dan Carpenter and Steve Wise. Fix sparse endianness in hash. - Roland ] Signed-off-by: Roland Dreier <roland@purestorage.com>
2014-03-27 06:07:35 +08:00
int len, int client, int op, int flags)
{
*nlh = nlmsg_put(skb, 0, seq, RDMA_NL_GET_TYPE(client, op), len, flags);
if (!*nlh)
return NULL;
return nlmsg_data(*nlh);
}
EXPORT_SYMBOL(ibnl_put_msg);
int ibnl_put_attr(struct sk_buff *skb, struct nlmsghdr *nlh,
int len, void *data, int type)
{
if (nla_put(skb, type, len, data)) {
nlmsg_cancel(skb, nlh);
return -EMSGSIZE;
}
return 0;
}
EXPORT_SYMBOL(ibnl_put_attr);
static int rdma_nl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
int type = nlh->nlmsg_type;
unsigned int index = RDMA_NL_GET_CLIENT(type);
unsigned int op = RDMA_NL_GET_OP(type);
const struct rdma_nl_cbs *cb_table;
IB/core: Avoid deadlock during netlink message handling When rdmacm module is not loaded, and when netlink message is received to get char device info, it results into a deadlock due to recursive locking of rdma_nl_mutex with the below call sequence. [..] rdma_nl_rcv() mutex_lock() [..] rdma_nl_rcv_msg() ib_get_client_nl_info() request_module() iw_cm_init() rdma_nl_register() mutex_lock(); <- Deadlock, acquiring mutex again Due to above call sequence, following call trace and deadlock is observed. kernel: __mutex_lock+0x35e/0x860 kernel: ? __mutex_lock+0x129/0x860 kernel: ? rdma_nl_register+0x1a/0x90 [ib_core] kernel: rdma_nl_register+0x1a/0x90 [ib_core] kernel: ? 0xffffffffc029b000 kernel: iw_cm_init+0x34/0x1000 [iw_cm] kernel: do_one_initcall+0x67/0x2d4 kernel: ? kmem_cache_alloc_trace+0x1ec/0x2a0 kernel: do_init_module+0x5a/0x223 kernel: load_module+0x1998/0x1e10 kernel: ? __symbol_put+0x60/0x60 kernel: __do_sys_finit_module+0x94/0xe0 kernel: do_syscall_64+0x5a/0x270 kernel: entry_SYSCALL_64_after_hwframe+0x49/0xbe process stack trace: [<0>] __request_module+0x1c9/0x460 [<0>] ib_get_client_nl_info+0x5e/0xb0 [ib_core] [<0>] nldev_get_chardev+0x1ac/0x320 [ib_core] [<0>] rdma_nl_rcv_msg+0xeb/0x1d0 [ib_core] [<0>] rdma_nl_rcv+0xcd/0x120 [ib_core] [<0>] netlink_unicast+0x179/0x220 [<0>] netlink_sendmsg+0x2f6/0x3f0 [<0>] sock_sendmsg+0x30/0x40 [<0>] ___sys_sendmsg+0x27a/0x290 [<0>] __sys_sendmsg+0x58/0xa0 [<0>] do_syscall_64+0x5a/0x270 [<0>] entry_SYSCALL_64_after_hwframe+0x49/0xbe To overcome this deadlock and to allow multiple netlink messages to progress in parallel, following scheme is implemented. 1. Split the lock protecting the cb_table into a per-index lock, and make it a rwlock. This lock is used to ensure no callbacks are running after unregistration returns. Since a module will not be registered once it is already running callbacks, this avoids the deadlock. 2. Use smp_store_release() to update the cb_table during registration so that no lock is required. This avoids lockdep problems with thinking all the rwsems are the same lock class. Fixes: 0e2d00eb6fd45 ("RDMA: Add NLDEV_GET_CHARDEV to allow char dev discovery and autoload") Link: https://lore.kernel.org/r/20191015080733.18625-1-leon@kernel.org Signed-off-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2019-10-15 16:07:33 +08:00
int err = -EINVAL;
IB/core: Avoid deadlock during netlink message handling When rdmacm module is not loaded, and when netlink message is received to get char device info, it results into a deadlock due to recursive locking of rdma_nl_mutex with the below call sequence. [..] rdma_nl_rcv() mutex_lock() [..] rdma_nl_rcv_msg() ib_get_client_nl_info() request_module() iw_cm_init() rdma_nl_register() mutex_lock(); <- Deadlock, acquiring mutex again Due to above call sequence, following call trace and deadlock is observed. kernel: __mutex_lock+0x35e/0x860 kernel: ? __mutex_lock+0x129/0x860 kernel: ? rdma_nl_register+0x1a/0x90 [ib_core] kernel: rdma_nl_register+0x1a/0x90 [ib_core] kernel: ? 0xffffffffc029b000 kernel: iw_cm_init+0x34/0x1000 [iw_cm] kernel: do_one_initcall+0x67/0x2d4 kernel: ? kmem_cache_alloc_trace+0x1ec/0x2a0 kernel: do_init_module+0x5a/0x223 kernel: load_module+0x1998/0x1e10 kernel: ? __symbol_put+0x60/0x60 kernel: __do_sys_finit_module+0x94/0xe0 kernel: do_syscall_64+0x5a/0x270 kernel: entry_SYSCALL_64_after_hwframe+0x49/0xbe process stack trace: [<0>] __request_module+0x1c9/0x460 [<0>] ib_get_client_nl_info+0x5e/0xb0 [ib_core] [<0>] nldev_get_chardev+0x1ac/0x320 [ib_core] [<0>] rdma_nl_rcv_msg+0xeb/0x1d0 [ib_core] [<0>] rdma_nl_rcv+0xcd/0x120 [ib_core] [<0>] netlink_unicast+0x179/0x220 [<0>] netlink_sendmsg+0x2f6/0x3f0 [<0>] sock_sendmsg+0x30/0x40 [<0>] ___sys_sendmsg+0x27a/0x290 [<0>] __sys_sendmsg+0x58/0xa0 [<0>] do_syscall_64+0x5a/0x270 [<0>] entry_SYSCALL_64_after_hwframe+0x49/0xbe To overcome this deadlock and to allow multiple netlink messages to progress in parallel, following scheme is implemented. 1. Split the lock protecting the cb_table into a per-index lock, and make it a rwlock. This lock is used to ensure no callbacks are running after unregistration returns. Since a module will not be registered once it is already running callbacks, this avoids the deadlock. 2. Use smp_store_release() to update the cb_table during registration so that no lock is required. This avoids lockdep problems with thinking all the rwsems are the same lock class. Fixes: 0e2d00eb6fd45 ("RDMA: Add NLDEV_GET_CHARDEV to allow char dev discovery and autoload") Link: https://lore.kernel.org/r/20191015080733.18625-1-leon@kernel.org Signed-off-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2019-10-15 16:07:33 +08:00
if (!is_nl_msg_valid(index, op))
return -EINVAL;
IB/core: Avoid deadlock during netlink message handling When rdmacm module is not loaded, and when netlink message is received to get char device info, it results into a deadlock due to recursive locking of rdma_nl_mutex with the below call sequence. [..] rdma_nl_rcv() mutex_lock() [..] rdma_nl_rcv_msg() ib_get_client_nl_info() request_module() iw_cm_init() rdma_nl_register() mutex_lock(); <- Deadlock, acquiring mutex again Due to above call sequence, following call trace and deadlock is observed. kernel: __mutex_lock+0x35e/0x860 kernel: ? __mutex_lock+0x129/0x860 kernel: ? rdma_nl_register+0x1a/0x90 [ib_core] kernel: rdma_nl_register+0x1a/0x90 [ib_core] kernel: ? 0xffffffffc029b000 kernel: iw_cm_init+0x34/0x1000 [iw_cm] kernel: do_one_initcall+0x67/0x2d4 kernel: ? kmem_cache_alloc_trace+0x1ec/0x2a0 kernel: do_init_module+0x5a/0x223 kernel: load_module+0x1998/0x1e10 kernel: ? __symbol_put+0x60/0x60 kernel: __do_sys_finit_module+0x94/0xe0 kernel: do_syscall_64+0x5a/0x270 kernel: entry_SYSCALL_64_after_hwframe+0x49/0xbe process stack trace: [<0>] __request_module+0x1c9/0x460 [<0>] ib_get_client_nl_info+0x5e/0xb0 [ib_core] [<0>] nldev_get_chardev+0x1ac/0x320 [ib_core] [<0>] rdma_nl_rcv_msg+0xeb/0x1d0 [ib_core] [<0>] rdma_nl_rcv+0xcd/0x120 [ib_core] [<0>] netlink_unicast+0x179/0x220 [<0>] netlink_sendmsg+0x2f6/0x3f0 [<0>] sock_sendmsg+0x30/0x40 [<0>] ___sys_sendmsg+0x27a/0x290 [<0>] __sys_sendmsg+0x58/0xa0 [<0>] do_syscall_64+0x5a/0x270 [<0>] entry_SYSCALL_64_after_hwframe+0x49/0xbe To overcome this deadlock and to allow multiple netlink messages to progress in parallel, following scheme is implemented. 1. Split the lock protecting the cb_table into a per-index lock, and make it a rwlock. This lock is used to ensure no callbacks are running after unregistration returns. Since a module will not be registered once it is already running callbacks, this avoids the deadlock. 2. Use smp_store_release() to update the cb_table during registration so that no lock is required. This avoids lockdep problems with thinking all the rwsems are the same lock class. Fixes: 0e2d00eb6fd45 ("RDMA: Add NLDEV_GET_CHARDEV to allow char dev discovery and autoload") Link: https://lore.kernel.org/r/20191015080733.18625-1-leon@kernel.org Signed-off-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2019-10-15 16:07:33 +08:00
down_read(&rdma_nl_types[index].sem);
cb_table = get_cb_table(skb, index, op);
if (!cb_table)
goto done;
if ((cb_table[op].flags & RDMA_NL_ADMIN_PERM) &&
IB/core: Avoid deadlock during netlink message handling When rdmacm module is not loaded, and when netlink message is received to get char device info, it results into a deadlock due to recursive locking of rdma_nl_mutex with the below call sequence. [..] rdma_nl_rcv() mutex_lock() [..] rdma_nl_rcv_msg() ib_get_client_nl_info() request_module() iw_cm_init() rdma_nl_register() mutex_lock(); <- Deadlock, acquiring mutex again Due to above call sequence, following call trace and deadlock is observed. kernel: __mutex_lock+0x35e/0x860 kernel: ? __mutex_lock+0x129/0x860 kernel: ? rdma_nl_register+0x1a/0x90 [ib_core] kernel: rdma_nl_register+0x1a/0x90 [ib_core] kernel: ? 0xffffffffc029b000 kernel: iw_cm_init+0x34/0x1000 [iw_cm] kernel: do_one_initcall+0x67/0x2d4 kernel: ? kmem_cache_alloc_trace+0x1ec/0x2a0 kernel: do_init_module+0x5a/0x223 kernel: load_module+0x1998/0x1e10 kernel: ? __symbol_put+0x60/0x60 kernel: __do_sys_finit_module+0x94/0xe0 kernel: do_syscall_64+0x5a/0x270 kernel: entry_SYSCALL_64_after_hwframe+0x49/0xbe process stack trace: [<0>] __request_module+0x1c9/0x460 [<0>] ib_get_client_nl_info+0x5e/0xb0 [ib_core] [<0>] nldev_get_chardev+0x1ac/0x320 [ib_core] [<0>] rdma_nl_rcv_msg+0xeb/0x1d0 [ib_core] [<0>] rdma_nl_rcv+0xcd/0x120 [ib_core] [<0>] netlink_unicast+0x179/0x220 [<0>] netlink_sendmsg+0x2f6/0x3f0 [<0>] sock_sendmsg+0x30/0x40 [<0>] ___sys_sendmsg+0x27a/0x290 [<0>] __sys_sendmsg+0x58/0xa0 [<0>] do_syscall_64+0x5a/0x270 [<0>] entry_SYSCALL_64_after_hwframe+0x49/0xbe To overcome this deadlock and to allow multiple netlink messages to progress in parallel, following scheme is implemented. 1. Split the lock protecting the cb_table into a per-index lock, and make it a rwlock. This lock is used to ensure no callbacks are running after unregistration returns. Since a module will not be registered once it is already running callbacks, this avoids the deadlock. 2. Use smp_store_release() to update the cb_table during registration so that no lock is required. This avoids lockdep problems with thinking all the rwsems are the same lock class. Fixes: 0e2d00eb6fd45 ("RDMA: Add NLDEV_GET_CHARDEV to allow char dev discovery and autoload") Link: https://lore.kernel.org/r/20191015080733.18625-1-leon@kernel.org Signed-off-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2019-10-15 16:07:33 +08:00
!netlink_capable(skb, CAP_NET_ADMIN)) {
err = -EPERM;
goto done;
}
RDMA/netlink: OOPs in rdma_nl_rcv_msg() from misinterpreted flag rdma_nl_rcv_msg() checks to see if it should use the .dump() callback or the .doit() callback. The check is done with this check: if (flags & NLM_F_DUMP) ... The NLM_F_DUMP flag is two bits (NLM_F_ROOT | NLM_F_MATCH). When an RDMA_NL_LS message (response) is received, the bit used for indicating an error is the same bit as NLM_F_ROOT. NLM_F_ROOT == (0x100) == RDMA_NL_LS_F_ERR. ibacm sends a response with the RDMA_NL_LS_F_ERR bit set if an error occurs in the service. The current code then misinterprets the NLM_F_DUMP bit and trys to call the .dump() callback. If the .dump() callback for the specified request is not available (which is true for the RDMA_NL_LS messages) the following Oops occurs: [ 4555.960256] BUG: unable to handle kernel NULL pointer dereference at (null) [ 4555.969046] IP: (null) [ 4555.972664] PGD 10543f1067 P4D 10543f1067 PUD 1033f93067 PMD 0 [ 4555.979287] Oops: 0010 [#1] SMP [ 4555.982809] Modules linked in: rpcrdma ib_isert iscsi_target_mod target_core_mod ib_iser libiscsi scsi_transport_iscsi ib_ipoib rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm dm_mirror dm_region_hash dm_log dm_mod dax sb_edac x86_pkg_temp_thermal intel_powerclamp coretemp kvm irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel pcbc aesni_intel crypto_simd glue_helper cryptd hfi1 rdmavt iTCO_wdt iTCO_vendor_support ib_core mei_me lpc_ich pcspkr mei ioatdma sg shpchp i2c_i801 mfd_core wmi ipmi_si ipmi_devintf ipmi_msghandler acpi_power_meter acpi_pad nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables ext4 mbcache jbd2 sd_mod mgag200 drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm igb ahci crc32c_intel ptp libahci pps_core drm dca libata i2c_algo_bit i2c_core [ 4556.061190] CPU: 54 PID: 9841 Comm: ibacm Tainted: G I 4.14.0-rc2+ #6 [ 4556.069667] Hardware name: Intel Corporation S2600WT2/S2600WT2, BIOS SE5C610.86B.01.01.0008.021120151325 02/11/2015 [ 4556.081339] task: ffff880855f42d00 task.stack: ffffc900246b4000 [ 4556.087967] RIP: 0010: (null) [ 4556.092166] RSP: 0018:ffffc900246b7bc8 EFLAGS: 00010246 [ 4556.098018] RAX: ffffffff81dbe9e0 RBX: ffff881058bb1000 RCX: 0000000000000000 [ 4556.105997] RDX: 0000000000001100 RSI: ffff881058bb1320 RDI: ffff881056362000 [ 4556.113984] RBP: ffffc900246b7bf8 R08: 0000000000000ec0 R09: 0000000000001100 [ 4556.121971] R10: ffff8810573a5000 R11: 0000000000000000 R12: ffff881056362000 [ 4556.129957] R13: 0000000000000ec0 R14: ffff881058bb1320 R15: 0000000000000ec0 [ 4556.137945] FS: 00007fe0ba5a38c0(0000) GS:ffff88105f080000(0000) knlGS:0000000000000000 [ 4556.147000] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 4556.153433] CR2: 0000000000000000 CR3: 0000001056f5d003 CR4: 00000000001606e0 [ 4556.161419] Call Trace: [ 4556.164167] ? netlink_dump+0x12c/0x290 [ 4556.168468] __netlink_dump_start+0x186/0x1f0 [ 4556.173357] rdma_nl_rcv_msg+0x193/0x1b0 [ib_core] [ 4556.178724] rdma_nl_rcv+0xdc/0x130 [ib_core] [ 4556.183604] netlink_unicast+0x181/0x240 [ 4556.187998] netlink_sendmsg+0x2c2/0x3b0 [ 4556.192392] sock_sendmsg+0x38/0x50 [ 4556.196299] SYSC_sendto+0x102/0x190 [ 4556.200308] ? __audit_syscall_entry+0xaf/0x100 [ 4556.205387] ? syscall_trace_enter+0x1d0/0x2b0 [ 4556.210366] ? __audit_syscall_exit+0x209/0x290 [ 4556.215442] SyS_sendto+0xe/0x10 [ 4556.219060] do_syscall_64+0x67/0x1b0 [ 4556.223165] entry_SYSCALL64_slow_path+0x25/0x25 [ 4556.228328] RIP: 0033:0x7fe0b9db2a63 [ 4556.232333] RSP: 002b:00007ffc55edc260 EFLAGS: 00000293 ORIG_RAX: 000000000000002c [ 4556.240808] RAX: ffffffffffffffda RBX: 0000000000000010 RCX: 00007fe0b9db2a63 [ 4556.248796] RDX: 0000000000000010 RSI: 00007ffc55edc280 RDI: 000000000000000d [ 4556.256782] RBP: 00007ffc55edc670 R08: 00007ffc55edc270 R09: 000000000000000c [ 4556.265321] R10: 0000000000000000 R11: 0000000000000293 R12: 00007ffc55edc280 [ 4556.273846] R13: 000000000260b400 R14: 000000000000000d R15: 0000000000000001 [ 4556.282368] Code: Bad RIP value. [ 4556.286629] RIP: (null) RSP: ffffc900246b7bc8 [ 4556.293013] CR2: 0000000000000000 [ 4556.297292] ---[ end trace 8d67abcfd10ec209 ]--- [ 4556.305465] Kernel panic - not syncing: Fatal exception [ 4556.313786] Kernel Offset: disabled [ 4556.321563] ---[ end Kernel panic - not syncing: Fatal exception [ 4556.328960] ------------[ cut here ]------------ Special case RDMA_NL_LS response messages to call the appropriate callback. Additionally, make sure that the .dump() callback is not NULL before calling it. Fixes: 647c75ac59a48a54 ("RDMA/netlink: Convert LS to doit callback") Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Reviewed-by: Kaike Wan <kaike.wan@intel.com> Reviewed-by: Alex Estrin <alex.estrin@intel.com> Signed-off-by: Michael J. Ruhl <michael.j.ruhl@intel.com> Reviewed-by: Shiraz Saleem <shiraz.saleem@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2017-10-24 20:41:01 +08:00
/*
* LS responses overload the 0x100 (NLM_F_ROOT) flag. Don't
* mistakenly call the .dump() function.
*/
if (index == RDMA_NL_LS) {
if (cb_table[op].doit)
IB/core: Avoid deadlock during netlink message handling When rdmacm module is not loaded, and when netlink message is received to get char device info, it results into a deadlock due to recursive locking of rdma_nl_mutex with the below call sequence. [..] rdma_nl_rcv() mutex_lock() [..] rdma_nl_rcv_msg() ib_get_client_nl_info() request_module() iw_cm_init() rdma_nl_register() mutex_lock(); <- Deadlock, acquiring mutex again Due to above call sequence, following call trace and deadlock is observed. kernel: __mutex_lock+0x35e/0x860 kernel: ? __mutex_lock+0x129/0x860 kernel: ? rdma_nl_register+0x1a/0x90 [ib_core] kernel: rdma_nl_register+0x1a/0x90 [ib_core] kernel: ? 0xffffffffc029b000 kernel: iw_cm_init+0x34/0x1000 [iw_cm] kernel: do_one_initcall+0x67/0x2d4 kernel: ? kmem_cache_alloc_trace+0x1ec/0x2a0 kernel: do_init_module+0x5a/0x223 kernel: load_module+0x1998/0x1e10 kernel: ? __symbol_put+0x60/0x60 kernel: __do_sys_finit_module+0x94/0xe0 kernel: do_syscall_64+0x5a/0x270 kernel: entry_SYSCALL_64_after_hwframe+0x49/0xbe process stack trace: [<0>] __request_module+0x1c9/0x460 [<0>] ib_get_client_nl_info+0x5e/0xb0 [ib_core] [<0>] nldev_get_chardev+0x1ac/0x320 [ib_core] [<0>] rdma_nl_rcv_msg+0xeb/0x1d0 [ib_core] [<0>] rdma_nl_rcv+0xcd/0x120 [ib_core] [<0>] netlink_unicast+0x179/0x220 [<0>] netlink_sendmsg+0x2f6/0x3f0 [<0>] sock_sendmsg+0x30/0x40 [<0>] ___sys_sendmsg+0x27a/0x290 [<0>] __sys_sendmsg+0x58/0xa0 [<0>] do_syscall_64+0x5a/0x270 [<0>] entry_SYSCALL_64_after_hwframe+0x49/0xbe To overcome this deadlock and to allow multiple netlink messages to progress in parallel, following scheme is implemented. 1. Split the lock protecting the cb_table into a per-index lock, and make it a rwlock. This lock is used to ensure no callbacks are running after unregistration returns. Since a module will not be registered once it is already running callbacks, this avoids the deadlock. 2. Use smp_store_release() to update the cb_table during registration so that no lock is required. This avoids lockdep problems with thinking all the rwsems are the same lock class. Fixes: 0e2d00eb6fd45 ("RDMA: Add NLDEV_GET_CHARDEV to allow char dev discovery and autoload") Link: https://lore.kernel.org/r/20191015080733.18625-1-leon@kernel.org Signed-off-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2019-10-15 16:07:33 +08:00
err = cb_table[op].doit(skb, nlh, extack);
goto done;
RDMA/netlink: OOPs in rdma_nl_rcv_msg() from misinterpreted flag rdma_nl_rcv_msg() checks to see if it should use the .dump() callback or the .doit() callback. The check is done with this check: if (flags & NLM_F_DUMP) ... The NLM_F_DUMP flag is two bits (NLM_F_ROOT | NLM_F_MATCH). When an RDMA_NL_LS message (response) is received, the bit used for indicating an error is the same bit as NLM_F_ROOT. NLM_F_ROOT == (0x100) == RDMA_NL_LS_F_ERR. ibacm sends a response with the RDMA_NL_LS_F_ERR bit set if an error occurs in the service. The current code then misinterprets the NLM_F_DUMP bit and trys to call the .dump() callback. If the .dump() callback for the specified request is not available (which is true for the RDMA_NL_LS messages) the following Oops occurs: [ 4555.960256] BUG: unable to handle kernel NULL pointer dereference at (null) [ 4555.969046] IP: (null) [ 4555.972664] PGD 10543f1067 P4D 10543f1067 PUD 1033f93067 PMD 0 [ 4555.979287] Oops: 0010 [#1] SMP [ 4555.982809] Modules linked in: rpcrdma ib_isert iscsi_target_mod target_core_mod ib_iser libiscsi scsi_transport_iscsi ib_ipoib rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm dm_mirror dm_region_hash dm_log dm_mod dax sb_edac x86_pkg_temp_thermal intel_powerclamp coretemp kvm irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel pcbc aesni_intel crypto_simd glue_helper cryptd hfi1 rdmavt iTCO_wdt iTCO_vendor_support ib_core mei_me lpc_ich pcspkr mei ioatdma sg shpchp i2c_i801 mfd_core wmi ipmi_si ipmi_devintf ipmi_msghandler acpi_power_meter acpi_pad nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables ext4 mbcache jbd2 sd_mod mgag200 drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm igb ahci crc32c_intel ptp libahci pps_core drm dca libata i2c_algo_bit i2c_core [ 4556.061190] CPU: 54 PID: 9841 Comm: ibacm Tainted: G I 4.14.0-rc2+ #6 [ 4556.069667] Hardware name: Intel Corporation S2600WT2/S2600WT2, BIOS SE5C610.86B.01.01.0008.021120151325 02/11/2015 [ 4556.081339] task: ffff880855f42d00 task.stack: ffffc900246b4000 [ 4556.087967] RIP: 0010: (null) [ 4556.092166] RSP: 0018:ffffc900246b7bc8 EFLAGS: 00010246 [ 4556.098018] RAX: ffffffff81dbe9e0 RBX: ffff881058bb1000 RCX: 0000000000000000 [ 4556.105997] RDX: 0000000000001100 RSI: ffff881058bb1320 RDI: ffff881056362000 [ 4556.113984] RBP: ffffc900246b7bf8 R08: 0000000000000ec0 R09: 0000000000001100 [ 4556.121971] R10: ffff8810573a5000 R11: 0000000000000000 R12: ffff881056362000 [ 4556.129957] R13: 0000000000000ec0 R14: ffff881058bb1320 R15: 0000000000000ec0 [ 4556.137945] FS: 00007fe0ba5a38c0(0000) GS:ffff88105f080000(0000) knlGS:0000000000000000 [ 4556.147000] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 4556.153433] CR2: 0000000000000000 CR3: 0000001056f5d003 CR4: 00000000001606e0 [ 4556.161419] Call Trace: [ 4556.164167] ? netlink_dump+0x12c/0x290 [ 4556.168468] __netlink_dump_start+0x186/0x1f0 [ 4556.173357] rdma_nl_rcv_msg+0x193/0x1b0 [ib_core] [ 4556.178724] rdma_nl_rcv+0xdc/0x130 [ib_core] [ 4556.183604] netlink_unicast+0x181/0x240 [ 4556.187998] netlink_sendmsg+0x2c2/0x3b0 [ 4556.192392] sock_sendmsg+0x38/0x50 [ 4556.196299] SYSC_sendto+0x102/0x190 [ 4556.200308] ? __audit_syscall_entry+0xaf/0x100 [ 4556.205387] ? syscall_trace_enter+0x1d0/0x2b0 [ 4556.210366] ? __audit_syscall_exit+0x209/0x290 [ 4556.215442] SyS_sendto+0xe/0x10 [ 4556.219060] do_syscall_64+0x67/0x1b0 [ 4556.223165] entry_SYSCALL64_slow_path+0x25/0x25 [ 4556.228328] RIP: 0033:0x7fe0b9db2a63 [ 4556.232333] RSP: 002b:00007ffc55edc260 EFLAGS: 00000293 ORIG_RAX: 000000000000002c [ 4556.240808] RAX: ffffffffffffffda RBX: 0000000000000010 RCX: 00007fe0b9db2a63 [ 4556.248796] RDX: 0000000000000010 RSI: 00007ffc55edc280 RDI: 000000000000000d [ 4556.256782] RBP: 00007ffc55edc670 R08: 00007ffc55edc270 R09: 000000000000000c [ 4556.265321] R10: 0000000000000000 R11: 0000000000000293 R12: 00007ffc55edc280 [ 4556.273846] R13: 000000000260b400 R14: 000000000000000d R15: 0000000000000001 [ 4556.282368] Code: Bad RIP value. [ 4556.286629] RIP: (null) RSP: ffffc900246b7bc8 [ 4556.293013] CR2: 0000000000000000 [ 4556.297292] ---[ end trace 8d67abcfd10ec209 ]--- [ 4556.305465] Kernel panic - not syncing: Fatal exception [ 4556.313786] Kernel Offset: disabled [ 4556.321563] ---[ end Kernel panic - not syncing: Fatal exception [ 4556.328960] ------------[ cut here ]------------ Special case RDMA_NL_LS response messages to call the appropriate callback. Additionally, make sure that the .dump() callback is not NULL before calling it. Fixes: 647c75ac59a48a54 ("RDMA/netlink: Convert LS to doit callback") Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Reviewed-by: Kaike Wan <kaike.wan@intel.com> Reviewed-by: Alex Estrin <alex.estrin@intel.com> Signed-off-by: Michael J. Ruhl <michael.j.ruhl@intel.com> Reviewed-by: Shiraz Saleem <shiraz.saleem@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2017-10-24 20:41:01 +08:00
}
/* FIXME: Convert IWCM to properly handle doit callbacks */
if ((nlh->nlmsg_flags & NLM_F_DUMP) || index == RDMA_NL_IWCM) {
struct netlink_dump_control c = {
.dump = cb_table[op].dump,
};
RDMA/netlink: OOPs in rdma_nl_rcv_msg() from misinterpreted flag rdma_nl_rcv_msg() checks to see if it should use the .dump() callback or the .doit() callback. The check is done with this check: if (flags & NLM_F_DUMP) ... The NLM_F_DUMP flag is two bits (NLM_F_ROOT | NLM_F_MATCH). When an RDMA_NL_LS message (response) is received, the bit used for indicating an error is the same bit as NLM_F_ROOT. NLM_F_ROOT == (0x100) == RDMA_NL_LS_F_ERR. ibacm sends a response with the RDMA_NL_LS_F_ERR bit set if an error occurs in the service. The current code then misinterprets the NLM_F_DUMP bit and trys to call the .dump() callback. If the .dump() callback for the specified request is not available (which is true for the RDMA_NL_LS messages) the following Oops occurs: [ 4555.960256] BUG: unable to handle kernel NULL pointer dereference at (null) [ 4555.969046] IP: (null) [ 4555.972664] PGD 10543f1067 P4D 10543f1067 PUD 1033f93067 PMD 0 [ 4555.979287] Oops: 0010 [#1] SMP [ 4555.982809] Modules linked in: rpcrdma ib_isert iscsi_target_mod target_core_mod ib_iser libiscsi scsi_transport_iscsi ib_ipoib rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm dm_mirror dm_region_hash dm_log dm_mod dax sb_edac x86_pkg_temp_thermal intel_powerclamp coretemp kvm irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel pcbc aesni_intel crypto_simd glue_helper cryptd hfi1 rdmavt iTCO_wdt iTCO_vendor_support ib_core mei_me lpc_ich pcspkr mei ioatdma sg shpchp i2c_i801 mfd_core wmi ipmi_si ipmi_devintf ipmi_msghandler acpi_power_meter acpi_pad nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables ext4 mbcache jbd2 sd_mod mgag200 drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm igb ahci crc32c_intel ptp libahci pps_core drm dca libata i2c_algo_bit i2c_core [ 4556.061190] CPU: 54 PID: 9841 Comm: ibacm Tainted: G I 4.14.0-rc2+ #6 [ 4556.069667] Hardware name: Intel Corporation S2600WT2/S2600WT2, BIOS SE5C610.86B.01.01.0008.021120151325 02/11/2015 [ 4556.081339] task: ffff880855f42d00 task.stack: ffffc900246b4000 [ 4556.087967] RIP: 0010: (null) [ 4556.092166] RSP: 0018:ffffc900246b7bc8 EFLAGS: 00010246 [ 4556.098018] RAX: ffffffff81dbe9e0 RBX: ffff881058bb1000 RCX: 0000000000000000 [ 4556.105997] RDX: 0000000000001100 RSI: ffff881058bb1320 RDI: ffff881056362000 [ 4556.113984] RBP: ffffc900246b7bf8 R08: 0000000000000ec0 R09: 0000000000001100 [ 4556.121971] R10: ffff8810573a5000 R11: 0000000000000000 R12: ffff881056362000 [ 4556.129957] R13: 0000000000000ec0 R14: ffff881058bb1320 R15: 0000000000000ec0 [ 4556.137945] FS: 00007fe0ba5a38c0(0000) GS:ffff88105f080000(0000) knlGS:0000000000000000 [ 4556.147000] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 4556.153433] CR2: 0000000000000000 CR3: 0000001056f5d003 CR4: 00000000001606e0 [ 4556.161419] Call Trace: [ 4556.164167] ? netlink_dump+0x12c/0x290 [ 4556.168468] __netlink_dump_start+0x186/0x1f0 [ 4556.173357] rdma_nl_rcv_msg+0x193/0x1b0 [ib_core] [ 4556.178724] rdma_nl_rcv+0xdc/0x130 [ib_core] [ 4556.183604] netlink_unicast+0x181/0x240 [ 4556.187998] netlink_sendmsg+0x2c2/0x3b0 [ 4556.192392] sock_sendmsg+0x38/0x50 [ 4556.196299] SYSC_sendto+0x102/0x190 [ 4556.200308] ? __audit_syscall_entry+0xaf/0x100 [ 4556.205387] ? syscall_trace_enter+0x1d0/0x2b0 [ 4556.210366] ? __audit_syscall_exit+0x209/0x290 [ 4556.215442] SyS_sendto+0xe/0x10 [ 4556.219060] do_syscall_64+0x67/0x1b0 [ 4556.223165] entry_SYSCALL64_slow_path+0x25/0x25 [ 4556.228328] RIP: 0033:0x7fe0b9db2a63 [ 4556.232333] RSP: 002b:00007ffc55edc260 EFLAGS: 00000293 ORIG_RAX: 000000000000002c [ 4556.240808] RAX: ffffffffffffffda RBX: 0000000000000010 RCX: 00007fe0b9db2a63 [ 4556.248796] RDX: 0000000000000010 RSI: 00007ffc55edc280 RDI: 000000000000000d [ 4556.256782] RBP: 00007ffc55edc670 R08: 00007ffc55edc270 R09: 000000000000000c [ 4556.265321] R10: 0000000000000000 R11: 0000000000000293 R12: 00007ffc55edc280 [ 4556.273846] R13: 000000000260b400 R14: 000000000000000d R15: 0000000000000001 [ 4556.282368] Code: Bad RIP value. [ 4556.286629] RIP: (null) RSP: ffffc900246b7bc8 [ 4556.293013] CR2: 0000000000000000 [ 4556.297292] ---[ end trace 8d67abcfd10ec209 ]--- [ 4556.305465] Kernel panic - not syncing: Fatal exception [ 4556.313786] Kernel Offset: disabled [ 4556.321563] ---[ end Kernel panic - not syncing: Fatal exception [ 4556.328960] ------------[ cut here ]------------ Special case RDMA_NL_LS response messages to call the appropriate callback. Additionally, make sure that the .dump() callback is not NULL before calling it. Fixes: 647c75ac59a48a54 ("RDMA/netlink: Convert LS to doit callback") Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Reviewed-by: Kaike Wan <kaike.wan@intel.com> Reviewed-by: Alex Estrin <alex.estrin@intel.com> Signed-off-by: Michael J. Ruhl <michael.j.ruhl@intel.com> Reviewed-by: Shiraz Saleem <shiraz.saleem@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2017-10-24 20:41:01 +08:00
if (c.dump)
IB/core: Avoid deadlock during netlink message handling When rdmacm module is not loaded, and when netlink message is received to get char device info, it results into a deadlock due to recursive locking of rdma_nl_mutex with the below call sequence. [..] rdma_nl_rcv() mutex_lock() [..] rdma_nl_rcv_msg() ib_get_client_nl_info() request_module() iw_cm_init() rdma_nl_register() mutex_lock(); <- Deadlock, acquiring mutex again Due to above call sequence, following call trace and deadlock is observed. kernel: __mutex_lock+0x35e/0x860 kernel: ? __mutex_lock+0x129/0x860 kernel: ? rdma_nl_register+0x1a/0x90 [ib_core] kernel: rdma_nl_register+0x1a/0x90 [ib_core] kernel: ? 0xffffffffc029b000 kernel: iw_cm_init+0x34/0x1000 [iw_cm] kernel: do_one_initcall+0x67/0x2d4 kernel: ? kmem_cache_alloc_trace+0x1ec/0x2a0 kernel: do_init_module+0x5a/0x223 kernel: load_module+0x1998/0x1e10 kernel: ? __symbol_put+0x60/0x60 kernel: __do_sys_finit_module+0x94/0xe0 kernel: do_syscall_64+0x5a/0x270 kernel: entry_SYSCALL_64_after_hwframe+0x49/0xbe process stack trace: [<0>] __request_module+0x1c9/0x460 [<0>] ib_get_client_nl_info+0x5e/0xb0 [ib_core] [<0>] nldev_get_chardev+0x1ac/0x320 [ib_core] [<0>] rdma_nl_rcv_msg+0xeb/0x1d0 [ib_core] [<0>] rdma_nl_rcv+0xcd/0x120 [ib_core] [<0>] netlink_unicast+0x179/0x220 [<0>] netlink_sendmsg+0x2f6/0x3f0 [<0>] sock_sendmsg+0x30/0x40 [<0>] ___sys_sendmsg+0x27a/0x290 [<0>] __sys_sendmsg+0x58/0xa0 [<0>] do_syscall_64+0x5a/0x270 [<0>] entry_SYSCALL_64_after_hwframe+0x49/0xbe To overcome this deadlock and to allow multiple netlink messages to progress in parallel, following scheme is implemented. 1. Split the lock protecting the cb_table into a per-index lock, and make it a rwlock. This lock is used to ensure no callbacks are running after unregistration returns. Since a module will not be registered once it is already running callbacks, this avoids the deadlock. 2. Use smp_store_release() to update the cb_table during registration so that no lock is required. This avoids lockdep problems with thinking all the rwsems are the same lock class. Fixes: 0e2d00eb6fd45 ("RDMA: Add NLDEV_GET_CHARDEV to allow char dev discovery and autoload") Link: https://lore.kernel.org/r/20191015080733.18625-1-leon@kernel.org Signed-off-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2019-10-15 16:07:33 +08:00
err = netlink_dump_start(skb->sk, skb, nlh, &c);
goto done;
}
if (cb_table[op].doit)
IB/core: Avoid deadlock during netlink message handling When rdmacm module is not loaded, and when netlink message is received to get char device info, it results into a deadlock due to recursive locking of rdma_nl_mutex with the below call sequence. [..] rdma_nl_rcv() mutex_lock() [..] rdma_nl_rcv_msg() ib_get_client_nl_info() request_module() iw_cm_init() rdma_nl_register() mutex_lock(); <- Deadlock, acquiring mutex again Due to above call sequence, following call trace and deadlock is observed. kernel: __mutex_lock+0x35e/0x860 kernel: ? __mutex_lock+0x129/0x860 kernel: ? rdma_nl_register+0x1a/0x90 [ib_core] kernel: rdma_nl_register+0x1a/0x90 [ib_core] kernel: ? 0xffffffffc029b000 kernel: iw_cm_init+0x34/0x1000 [iw_cm] kernel: do_one_initcall+0x67/0x2d4 kernel: ? kmem_cache_alloc_trace+0x1ec/0x2a0 kernel: do_init_module+0x5a/0x223 kernel: load_module+0x1998/0x1e10 kernel: ? __symbol_put+0x60/0x60 kernel: __do_sys_finit_module+0x94/0xe0 kernel: do_syscall_64+0x5a/0x270 kernel: entry_SYSCALL_64_after_hwframe+0x49/0xbe process stack trace: [<0>] __request_module+0x1c9/0x460 [<0>] ib_get_client_nl_info+0x5e/0xb0 [ib_core] [<0>] nldev_get_chardev+0x1ac/0x320 [ib_core] [<0>] rdma_nl_rcv_msg+0xeb/0x1d0 [ib_core] [<0>] rdma_nl_rcv+0xcd/0x120 [ib_core] [<0>] netlink_unicast+0x179/0x220 [<0>] netlink_sendmsg+0x2f6/0x3f0 [<0>] sock_sendmsg+0x30/0x40 [<0>] ___sys_sendmsg+0x27a/0x290 [<0>] __sys_sendmsg+0x58/0xa0 [<0>] do_syscall_64+0x5a/0x270 [<0>] entry_SYSCALL_64_after_hwframe+0x49/0xbe To overcome this deadlock and to allow multiple netlink messages to progress in parallel, following scheme is implemented. 1. Split the lock protecting the cb_table into a per-index lock, and make it a rwlock. This lock is used to ensure no callbacks are running after unregistration returns. Since a module will not be registered once it is already running callbacks, this avoids the deadlock. 2. Use smp_store_release() to update the cb_table during registration so that no lock is required. This avoids lockdep problems with thinking all the rwsems are the same lock class. Fixes: 0e2d00eb6fd45 ("RDMA: Add NLDEV_GET_CHARDEV to allow char dev discovery and autoload") Link: https://lore.kernel.org/r/20191015080733.18625-1-leon@kernel.org Signed-off-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2019-10-15 16:07:33 +08:00
err = cb_table[op].doit(skb, nlh, extack);
done:
up_read(&rdma_nl_types[index].sem);
return err;
}
/*
* This function is similar to netlink_rcv_skb with one exception:
* It calls to the callback for the netlink messages without NLM_F_REQUEST
* flag. These messages are intended for RDMA_NL_LS consumer, so it is allowed
* for that consumer only.
*/
static int rdma_nl_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *,
struct nlmsghdr *,
struct netlink_ext_ack *))
{
struct netlink_ext_ack extack = {};
struct nlmsghdr *nlh;
int err;
while (skb->len >= nlmsg_total_size(0)) {
int msglen;
nlh = nlmsg_hdr(skb);
err = 0;
if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len)
return 0;
/*
* Generally speaking, the only requests are handled
* by the kernel, but RDMA_NL_LS is different, because it
* runs backward netlink scheme. Kernel initiates messages
* and waits for reply with data to keep pathrecord cache
* in sync.
*/
if (!(nlh->nlmsg_flags & NLM_F_REQUEST) &&
(RDMA_NL_GET_CLIENT(nlh->nlmsg_type) != RDMA_NL_LS))
goto ack;
/* Skip control messages */
if (nlh->nlmsg_type < NLMSG_MIN_TYPE)
goto ack;
err = cb(skb, nlh, &extack);
if (err == -EINTR)
goto skip;
ack:
if (nlh->nlmsg_flags & NLM_F_ACK || err)
netlink_ack(skb, nlh, err, &extack);
skip:
msglen = NLMSG_ALIGN(nlh->nlmsg_len);
if (msglen > skb->len)
msglen = skb->len;
skb_pull(skb, msglen);
}
return 0;
}
static void rdma_nl_rcv(struct sk_buff *skb)
{
rdma_nl_rcv_skb(skb, &rdma_nl_rcv_msg);
}
int rdma_nl_unicast(struct net *net, struct sk_buff *skb, u32 pid)
RDMA/core: Add support for iWARP Port Mapper user space service This patch adds iWARP Port Mapper (IWPM) Version 2 support. The iWARP Port Mapper implementation is based on the port mapper specification section in the Sockets Direct Protocol paper - http://www.rdmaconsortium.org/home/draft-pinkerton-iwarp-sdp-v1.0.pdf Existing iWARP RDMA providers use the same IP address as the native TCP/IP stack when creating RDMA connections. They need a mechanism to claim the TCP ports used for RDMA connections to prevent TCP port collisions when other host applications use TCP ports. The iWARP Port Mapper provides a standard mechanism to accomplish this. Without this service it is possible for RDMA application to bind/listen on the same port which is already being used by native TCP host application. If that happens the incoming TCP connection data can be passed to the RDMA stack with error. The iWARP Port Mapper solution doesn't contain any changes to the existing network stack in the kernel space. All the changes are contained with the infiniband tree and also in user space. The iWARP Port Mapper service is implemented as a user space daemon process. Source for the IWPM service is located at http://git.openfabrics.org/git?p=~tnikolova/libiwpm-1.0.0/.git;a=summary The iWARP driver (port mapper client) sends to the IWPM service the local IP address and TCP port it has received from the RDMA application, when starting a connection. The IWPM service performs a socket bind from user space to get an available TCP port, called a mapped port, and communicates it back to the client. In that sense, the IWPM service is used to map the TCP port, which the RDMA application uses to any port available from the host TCP port space. The mapped ports are used in iWARP RDMA connections to avoid collisions with native TCP stack which is aware that these ports are taken. When an RDMA connection using a mapped port is terminated, the client notifies the IWPM service, which then releases the TCP port. The message exchange between the IWPM service and the iWARP drivers (between user space and kernel space) is implemented using netlink sockets. 1) Netlink interface functions are added: ibnl_unicast() and ibnl_mulitcast() for sending netlink messages to user space 2) The signature of the existing ibnl_put_msg() is changed to be more generic 3) Two netlink clients are added: RDMA_NL_NES, RDMA_NL_C4IW corresponding to the two iWarp drivers - nes and cxgb4 which use the IWPM service 4) Enums are added to enumerate the attributes in the netlink messages, which are exchanged between the user space IWPM service and the iWARP drivers Signed-off-by: Tatyana Nikolova <tatyana.e.nikolova@intel.com> Signed-off-by: Steve Wise <swise@opengridcomputing.com> Reviewed-by: PJ Waskiewicz <pj.waskiewicz@solidfire.com> [ Fold in range checking fixes and nlh_next removal as suggested by Dan Carpenter and Steve Wise. Fix sparse endianness in hash. - Roland ] Signed-off-by: Roland Dreier <roland@purestorage.com>
2014-03-27 06:07:35 +08:00
{
struct rdma_dev_net *rnet = rdma_net_to_dev_net(net);
int err;
err = netlink_unicast(rnet->nl_sock, skb, pid, MSG_DONTWAIT);
return (err < 0) ? err : 0;
RDMA/core: Add support for iWARP Port Mapper user space service This patch adds iWARP Port Mapper (IWPM) Version 2 support. The iWARP Port Mapper implementation is based on the port mapper specification section in the Sockets Direct Protocol paper - http://www.rdmaconsortium.org/home/draft-pinkerton-iwarp-sdp-v1.0.pdf Existing iWARP RDMA providers use the same IP address as the native TCP/IP stack when creating RDMA connections. They need a mechanism to claim the TCP ports used for RDMA connections to prevent TCP port collisions when other host applications use TCP ports. The iWARP Port Mapper provides a standard mechanism to accomplish this. Without this service it is possible for RDMA application to bind/listen on the same port which is already being used by native TCP host application. If that happens the incoming TCP connection data can be passed to the RDMA stack with error. The iWARP Port Mapper solution doesn't contain any changes to the existing network stack in the kernel space. All the changes are contained with the infiniband tree and also in user space. The iWARP Port Mapper service is implemented as a user space daemon process. Source for the IWPM service is located at http://git.openfabrics.org/git?p=~tnikolova/libiwpm-1.0.0/.git;a=summary The iWARP driver (port mapper client) sends to the IWPM service the local IP address and TCP port it has received from the RDMA application, when starting a connection. The IWPM service performs a socket bind from user space to get an available TCP port, called a mapped port, and communicates it back to the client. In that sense, the IWPM service is used to map the TCP port, which the RDMA application uses to any port available from the host TCP port space. The mapped ports are used in iWARP RDMA connections to avoid collisions with native TCP stack which is aware that these ports are taken. When an RDMA connection using a mapped port is terminated, the client notifies the IWPM service, which then releases the TCP port. The message exchange between the IWPM service and the iWARP drivers (between user space and kernel space) is implemented using netlink sockets. 1) Netlink interface functions are added: ibnl_unicast() and ibnl_mulitcast() for sending netlink messages to user space 2) The signature of the existing ibnl_put_msg() is changed to be more generic 3) Two netlink clients are added: RDMA_NL_NES, RDMA_NL_C4IW corresponding to the two iWarp drivers - nes and cxgb4 which use the IWPM service 4) Enums are added to enumerate the attributes in the netlink messages, which are exchanged between the user space IWPM service and the iWARP drivers Signed-off-by: Tatyana Nikolova <tatyana.e.nikolova@intel.com> Signed-off-by: Steve Wise <swise@opengridcomputing.com> Reviewed-by: PJ Waskiewicz <pj.waskiewicz@solidfire.com> [ Fold in range checking fixes and nlh_next removal as suggested by Dan Carpenter and Steve Wise. Fix sparse endianness in hash. - Roland ] Signed-off-by: Roland Dreier <roland@purestorage.com>
2014-03-27 06:07:35 +08:00
}
EXPORT_SYMBOL(rdma_nl_unicast);
RDMA/core: Add support for iWARP Port Mapper user space service This patch adds iWARP Port Mapper (IWPM) Version 2 support. The iWARP Port Mapper implementation is based on the port mapper specification section in the Sockets Direct Protocol paper - http://www.rdmaconsortium.org/home/draft-pinkerton-iwarp-sdp-v1.0.pdf Existing iWARP RDMA providers use the same IP address as the native TCP/IP stack when creating RDMA connections. They need a mechanism to claim the TCP ports used for RDMA connections to prevent TCP port collisions when other host applications use TCP ports. The iWARP Port Mapper provides a standard mechanism to accomplish this. Without this service it is possible for RDMA application to bind/listen on the same port which is already being used by native TCP host application. If that happens the incoming TCP connection data can be passed to the RDMA stack with error. The iWARP Port Mapper solution doesn't contain any changes to the existing network stack in the kernel space. All the changes are contained with the infiniband tree and also in user space. The iWARP Port Mapper service is implemented as a user space daemon process. Source for the IWPM service is located at http://git.openfabrics.org/git?p=~tnikolova/libiwpm-1.0.0/.git;a=summary The iWARP driver (port mapper client) sends to the IWPM service the local IP address and TCP port it has received from the RDMA application, when starting a connection. The IWPM service performs a socket bind from user space to get an available TCP port, called a mapped port, and communicates it back to the client. In that sense, the IWPM service is used to map the TCP port, which the RDMA application uses to any port available from the host TCP port space. The mapped ports are used in iWARP RDMA connections to avoid collisions with native TCP stack which is aware that these ports are taken. When an RDMA connection using a mapped port is terminated, the client notifies the IWPM service, which then releases the TCP port. The message exchange between the IWPM service and the iWARP drivers (between user space and kernel space) is implemented using netlink sockets. 1) Netlink interface functions are added: ibnl_unicast() and ibnl_mulitcast() for sending netlink messages to user space 2) The signature of the existing ibnl_put_msg() is changed to be more generic 3) Two netlink clients are added: RDMA_NL_NES, RDMA_NL_C4IW corresponding to the two iWarp drivers - nes and cxgb4 which use the IWPM service 4) Enums are added to enumerate the attributes in the netlink messages, which are exchanged between the user space IWPM service and the iWARP drivers Signed-off-by: Tatyana Nikolova <tatyana.e.nikolova@intel.com> Signed-off-by: Steve Wise <swise@opengridcomputing.com> Reviewed-by: PJ Waskiewicz <pj.waskiewicz@solidfire.com> [ Fold in range checking fixes and nlh_next removal as suggested by Dan Carpenter and Steve Wise. Fix sparse endianness in hash. - Roland ] Signed-off-by: Roland Dreier <roland@purestorage.com>
2014-03-27 06:07:35 +08:00
int rdma_nl_unicast_wait(struct net *net, struct sk_buff *skb, __u32 pid)
{
struct rdma_dev_net *rnet = rdma_net_to_dev_net(net);
int err;
err = netlink_unicast(rnet->nl_sock, skb, pid, 0);
return (err < 0) ? err : 0;
}
EXPORT_SYMBOL(rdma_nl_unicast_wait);
int rdma_nl_multicast(struct net *net, struct sk_buff *skb,
unsigned int group, gfp_t flags)
RDMA/core: Add support for iWARP Port Mapper user space service This patch adds iWARP Port Mapper (IWPM) Version 2 support. The iWARP Port Mapper implementation is based on the port mapper specification section in the Sockets Direct Protocol paper - http://www.rdmaconsortium.org/home/draft-pinkerton-iwarp-sdp-v1.0.pdf Existing iWARP RDMA providers use the same IP address as the native TCP/IP stack when creating RDMA connections. They need a mechanism to claim the TCP ports used for RDMA connections to prevent TCP port collisions when other host applications use TCP ports. The iWARP Port Mapper provides a standard mechanism to accomplish this. Without this service it is possible for RDMA application to bind/listen on the same port which is already being used by native TCP host application. If that happens the incoming TCP connection data can be passed to the RDMA stack with error. The iWARP Port Mapper solution doesn't contain any changes to the existing network stack in the kernel space. All the changes are contained with the infiniband tree and also in user space. The iWARP Port Mapper service is implemented as a user space daemon process. Source for the IWPM service is located at http://git.openfabrics.org/git?p=~tnikolova/libiwpm-1.0.0/.git;a=summary The iWARP driver (port mapper client) sends to the IWPM service the local IP address and TCP port it has received from the RDMA application, when starting a connection. The IWPM service performs a socket bind from user space to get an available TCP port, called a mapped port, and communicates it back to the client. In that sense, the IWPM service is used to map the TCP port, which the RDMA application uses to any port available from the host TCP port space. The mapped ports are used in iWARP RDMA connections to avoid collisions with native TCP stack which is aware that these ports are taken. When an RDMA connection using a mapped port is terminated, the client notifies the IWPM service, which then releases the TCP port. The message exchange between the IWPM service and the iWARP drivers (between user space and kernel space) is implemented using netlink sockets. 1) Netlink interface functions are added: ibnl_unicast() and ibnl_mulitcast() for sending netlink messages to user space 2) The signature of the existing ibnl_put_msg() is changed to be more generic 3) Two netlink clients are added: RDMA_NL_NES, RDMA_NL_C4IW corresponding to the two iWarp drivers - nes and cxgb4 which use the IWPM service 4) Enums are added to enumerate the attributes in the netlink messages, which are exchanged between the user space IWPM service and the iWARP drivers Signed-off-by: Tatyana Nikolova <tatyana.e.nikolova@intel.com> Signed-off-by: Steve Wise <swise@opengridcomputing.com> Reviewed-by: PJ Waskiewicz <pj.waskiewicz@solidfire.com> [ Fold in range checking fixes and nlh_next removal as suggested by Dan Carpenter and Steve Wise. Fix sparse endianness in hash. - Roland ] Signed-off-by: Roland Dreier <roland@purestorage.com>
2014-03-27 06:07:35 +08:00
{
struct rdma_dev_net *rnet = rdma_net_to_dev_net(net);
return nlmsg_multicast(rnet->nl_sock, skb, 0, group, flags);
RDMA/core: Add support for iWARP Port Mapper user space service This patch adds iWARP Port Mapper (IWPM) Version 2 support. The iWARP Port Mapper implementation is based on the port mapper specification section in the Sockets Direct Protocol paper - http://www.rdmaconsortium.org/home/draft-pinkerton-iwarp-sdp-v1.0.pdf Existing iWARP RDMA providers use the same IP address as the native TCP/IP stack when creating RDMA connections. They need a mechanism to claim the TCP ports used for RDMA connections to prevent TCP port collisions when other host applications use TCP ports. The iWARP Port Mapper provides a standard mechanism to accomplish this. Without this service it is possible for RDMA application to bind/listen on the same port which is already being used by native TCP host application. If that happens the incoming TCP connection data can be passed to the RDMA stack with error. The iWARP Port Mapper solution doesn't contain any changes to the existing network stack in the kernel space. All the changes are contained with the infiniband tree and also in user space. The iWARP Port Mapper service is implemented as a user space daemon process. Source for the IWPM service is located at http://git.openfabrics.org/git?p=~tnikolova/libiwpm-1.0.0/.git;a=summary The iWARP driver (port mapper client) sends to the IWPM service the local IP address and TCP port it has received from the RDMA application, when starting a connection. The IWPM service performs a socket bind from user space to get an available TCP port, called a mapped port, and communicates it back to the client. In that sense, the IWPM service is used to map the TCP port, which the RDMA application uses to any port available from the host TCP port space. The mapped ports are used in iWARP RDMA connections to avoid collisions with native TCP stack which is aware that these ports are taken. When an RDMA connection using a mapped port is terminated, the client notifies the IWPM service, which then releases the TCP port. The message exchange between the IWPM service and the iWARP drivers (between user space and kernel space) is implemented using netlink sockets. 1) Netlink interface functions are added: ibnl_unicast() and ibnl_mulitcast() for sending netlink messages to user space 2) The signature of the existing ibnl_put_msg() is changed to be more generic 3) Two netlink clients are added: RDMA_NL_NES, RDMA_NL_C4IW corresponding to the two iWarp drivers - nes and cxgb4 which use the IWPM service 4) Enums are added to enumerate the attributes in the netlink messages, which are exchanged between the user space IWPM service and the iWARP drivers Signed-off-by: Tatyana Nikolova <tatyana.e.nikolova@intel.com> Signed-off-by: Steve Wise <swise@opengridcomputing.com> Reviewed-by: PJ Waskiewicz <pj.waskiewicz@solidfire.com> [ Fold in range checking fixes and nlh_next removal as suggested by Dan Carpenter and Steve Wise. Fix sparse endianness in hash. - Roland ] Signed-off-by: Roland Dreier <roland@purestorage.com>
2014-03-27 06:07:35 +08:00
}
EXPORT_SYMBOL(rdma_nl_multicast);
RDMA/core: Add support for iWARP Port Mapper user space service This patch adds iWARP Port Mapper (IWPM) Version 2 support. The iWARP Port Mapper implementation is based on the port mapper specification section in the Sockets Direct Protocol paper - http://www.rdmaconsortium.org/home/draft-pinkerton-iwarp-sdp-v1.0.pdf Existing iWARP RDMA providers use the same IP address as the native TCP/IP stack when creating RDMA connections. They need a mechanism to claim the TCP ports used for RDMA connections to prevent TCP port collisions when other host applications use TCP ports. The iWARP Port Mapper provides a standard mechanism to accomplish this. Without this service it is possible for RDMA application to bind/listen on the same port which is already being used by native TCP host application. If that happens the incoming TCP connection data can be passed to the RDMA stack with error. The iWARP Port Mapper solution doesn't contain any changes to the existing network stack in the kernel space. All the changes are contained with the infiniband tree and also in user space. The iWARP Port Mapper service is implemented as a user space daemon process. Source for the IWPM service is located at http://git.openfabrics.org/git?p=~tnikolova/libiwpm-1.0.0/.git;a=summary The iWARP driver (port mapper client) sends to the IWPM service the local IP address and TCP port it has received from the RDMA application, when starting a connection. The IWPM service performs a socket bind from user space to get an available TCP port, called a mapped port, and communicates it back to the client. In that sense, the IWPM service is used to map the TCP port, which the RDMA application uses to any port available from the host TCP port space. The mapped ports are used in iWARP RDMA connections to avoid collisions with native TCP stack which is aware that these ports are taken. When an RDMA connection using a mapped port is terminated, the client notifies the IWPM service, which then releases the TCP port. The message exchange between the IWPM service and the iWARP drivers (between user space and kernel space) is implemented using netlink sockets. 1) Netlink interface functions are added: ibnl_unicast() and ibnl_mulitcast() for sending netlink messages to user space 2) The signature of the existing ibnl_put_msg() is changed to be more generic 3) Two netlink clients are added: RDMA_NL_NES, RDMA_NL_C4IW corresponding to the two iWarp drivers - nes and cxgb4 which use the IWPM service 4) Enums are added to enumerate the attributes in the netlink messages, which are exchanged between the user space IWPM service and the iWARP drivers Signed-off-by: Tatyana Nikolova <tatyana.e.nikolova@intel.com> Signed-off-by: Steve Wise <swise@opengridcomputing.com> Reviewed-by: PJ Waskiewicz <pj.waskiewicz@solidfire.com> [ Fold in range checking fixes and nlh_next removal as suggested by Dan Carpenter and Steve Wise. Fix sparse endianness in hash. - Roland ] Signed-off-by: Roland Dreier <roland@purestorage.com>
2014-03-27 06:07:35 +08:00
IB/core: Avoid deadlock during netlink message handling When rdmacm module is not loaded, and when netlink message is received to get char device info, it results into a deadlock due to recursive locking of rdma_nl_mutex with the below call sequence. [..] rdma_nl_rcv() mutex_lock() [..] rdma_nl_rcv_msg() ib_get_client_nl_info() request_module() iw_cm_init() rdma_nl_register() mutex_lock(); <- Deadlock, acquiring mutex again Due to above call sequence, following call trace and deadlock is observed. kernel: __mutex_lock+0x35e/0x860 kernel: ? __mutex_lock+0x129/0x860 kernel: ? rdma_nl_register+0x1a/0x90 [ib_core] kernel: rdma_nl_register+0x1a/0x90 [ib_core] kernel: ? 0xffffffffc029b000 kernel: iw_cm_init+0x34/0x1000 [iw_cm] kernel: do_one_initcall+0x67/0x2d4 kernel: ? kmem_cache_alloc_trace+0x1ec/0x2a0 kernel: do_init_module+0x5a/0x223 kernel: load_module+0x1998/0x1e10 kernel: ? __symbol_put+0x60/0x60 kernel: __do_sys_finit_module+0x94/0xe0 kernel: do_syscall_64+0x5a/0x270 kernel: entry_SYSCALL_64_after_hwframe+0x49/0xbe process stack trace: [<0>] __request_module+0x1c9/0x460 [<0>] ib_get_client_nl_info+0x5e/0xb0 [ib_core] [<0>] nldev_get_chardev+0x1ac/0x320 [ib_core] [<0>] rdma_nl_rcv_msg+0xeb/0x1d0 [ib_core] [<0>] rdma_nl_rcv+0xcd/0x120 [ib_core] [<0>] netlink_unicast+0x179/0x220 [<0>] netlink_sendmsg+0x2f6/0x3f0 [<0>] sock_sendmsg+0x30/0x40 [<0>] ___sys_sendmsg+0x27a/0x290 [<0>] __sys_sendmsg+0x58/0xa0 [<0>] do_syscall_64+0x5a/0x270 [<0>] entry_SYSCALL_64_after_hwframe+0x49/0xbe To overcome this deadlock and to allow multiple netlink messages to progress in parallel, following scheme is implemented. 1. Split the lock protecting the cb_table into a per-index lock, and make it a rwlock. This lock is used to ensure no callbacks are running after unregistration returns. Since a module will not be registered once it is already running callbacks, this avoids the deadlock. 2. Use smp_store_release() to update the cb_table during registration so that no lock is required. This avoids lockdep problems with thinking all the rwsems are the same lock class. Fixes: 0e2d00eb6fd45 ("RDMA: Add NLDEV_GET_CHARDEV to allow char dev discovery and autoload") Link: https://lore.kernel.org/r/20191015080733.18625-1-leon@kernel.org Signed-off-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2019-10-15 16:07:33 +08:00
void rdma_nl_init(void)
{
int idx;
for (idx = 0; idx < RDMA_NL_NUM_CLIENTS; idx++)
init_rwsem(&rdma_nl_types[idx].sem);
}
void rdma_nl_exit(void)
{
int idx;
for (idx = 0; idx < RDMA_NL_NUM_CLIENTS; idx++)
WARN(rdma_nl_types[idx].cb_table,
"Netlink client %d wasn't released prior to unloading %s\n",
idx, KBUILD_MODNAME);
}
int rdma_nl_net_init(struct rdma_dev_net *rnet)
{
struct net *net = read_pnet(&rnet->net);
struct netlink_kernel_cfg cfg = {
.input = rdma_nl_rcv,
};
struct sock *nls;
nls = netlink_kernel_create(net, NETLINK_RDMA, &cfg);
if (!nls)
return -ENOMEM;
nls->sk_sndtimeo = 10 * HZ;
rnet->nl_sock = nls;
return 0;
}
void rdma_nl_net_exit(struct rdma_dev_net *rnet)
{
netlink_kernel_release(rnet->nl_sock);
}
MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_RDMA);