net/smc: fix kernel panic caused by race of smc_sock

A crash occurs when smc_cdc_tx_handler() tries to access smc_sock
but smc_release() has already freed it.

[ 4570.695099] BUG: unable to handle page fault for address: 000000002eae9e88
[ 4570.696048] #PF: supervisor write access in kernel mode
[ 4570.696728] #PF: error_code(0x0002) - not-present page
[ 4570.697401] PGD 0 P4D 0
[ 4570.697716] Oops: 0002 [#1] PREEMPT SMP NOPTI
[ 4570.698228] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.16.0-rc4+ #111
[ 4570.699013] Hardware name: Alibaba Cloud Alibaba Cloud ECS, BIOS 8c24b4c 04/0
[ 4570.699933] RIP: 0010:_raw_spin_lock+0x1a/0x30
<...>
[ 4570.711446] Call Trace:
[ 4570.711746]  <IRQ>
[ 4570.711992]  smc_cdc_tx_handler+0x41/0xc0
[ 4570.712470]  smc_wr_tx_tasklet_fn+0x213/0x560
[ 4570.712981]  ? smc_cdc_tx_dismisser+0x10/0x10
[ 4570.713489]  tasklet_action_common.isra.17+0x66/0x140
[ 4570.714083]  __do_softirq+0x123/0x2f4
[ 4570.714521]  irq_exit_rcu+0xc4/0xf0
[ 4570.714934]  common_interrupt+0xba/0xe0

Though smc_cdc_tx_handler() checked the existence of smc connection,
smc_release() may have already dismissed and released the smc socket
before smc_cdc_tx_handler() further visits it.

smc_cdc_tx_handler()           |smc_release()
if (!conn)                     |
                               |
                               |smc_cdc_tx_dismiss_slots()
                               |      smc_cdc_tx_dismisser()
                               |
                               |sock_put(&smc->sk) <- last sock_put,
                               |                      smc_sock freed
bh_lock_sock(&smc->sk) (panic) |

To make sure we won't receive any CDC messages after we free the
smc_sock, add a refcount on the smc_connection for inflight CDC
message(posted to the QP but haven't received related CQE), and
don't release the smc_connection until all the inflight CDC messages
haven been done, for both success or failed ones.

Using refcount on CDC messages brings another problem: when the link
is going to be destroyed, smcr_link_clear() will reset the QP, which
then remove all the pending CQEs related to the QP in the CQ. To make
sure all the CQEs will always come back so the refcount on the
smc_connection can always reach 0, smc_ib_modify_qp_reset() was replaced
by smc_ib_modify_qp_error().
And remove the timeout in smc_wr_tx_wait_no_pending_sends() since we
need to wait for all pending WQEs done, or we may encounter use-after-
free when handling CQEs.

For IB device removal routine, we need to wait for all the QPs on that
device been destroyed before we can destroy CQs on the device, or
the refcount on smc_connection won't reach 0 and smc_sock cannot be
released.

Fixes: 5f08318f61 ("smc: connection data control (CDC)")
Reported-by: Wen Gu <guwen@linux.alibaba.com>
Signed-off-by: Dust Li <dust.li@linux.alibaba.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
Dust Li 2021-12-28 17:03:25 +08:00 committed by David S. Miller
parent 90cee52f2e
commit 349d43127d
8 changed files with 57 additions and 76 deletions

View File

@ -180,6 +180,11 @@ struct smc_connection {
u16 tx_cdc_seq; /* sequence # for CDC send */ u16 tx_cdc_seq; /* sequence # for CDC send */
u16 tx_cdc_seq_fin; /* sequence # - tx completed */ u16 tx_cdc_seq_fin; /* sequence # - tx completed */
spinlock_t send_lock; /* protect wr_sends */ spinlock_t send_lock; /* protect wr_sends */
atomic_t cdc_pend_tx_wr; /* number of pending tx CDC wqe
* - inc when post wqe,
* - dec on polled tx cqe
*/
wait_queue_head_t cdc_pend_tx_wq; /* wakeup on no cdc_pend_tx_wr*/
struct delayed_work tx_work; /* retry of smc_cdc_msg_send */ struct delayed_work tx_work; /* retry of smc_cdc_msg_send */
u32 tx_off; /* base offset in peer rmb */ u32 tx_off; /* base offset in peer rmb */

View File

@ -31,10 +31,6 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd,
struct smc_sock *smc; struct smc_sock *smc;
int diff; int diff;
if (!conn)
/* already dismissed */
return;
smc = container_of(conn, struct smc_sock, conn); smc = container_of(conn, struct smc_sock, conn);
bh_lock_sock(&smc->sk); bh_lock_sock(&smc->sk);
if (!wc_status) { if (!wc_status) {
@ -51,6 +47,12 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd,
conn); conn);
conn->tx_cdc_seq_fin = cdcpend->ctrl_seq; conn->tx_cdc_seq_fin = cdcpend->ctrl_seq;
} }
if (atomic_dec_and_test(&conn->cdc_pend_tx_wr) &&
unlikely(wq_has_sleeper(&conn->cdc_pend_tx_wq)))
wake_up(&conn->cdc_pend_tx_wq);
WARN_ON(atomic_read(&conn->cdc_pend_tx_wr) < 0);
smc_tx_sndbuf_nonfull(smc); smc_tx_sndbuf_nonfull(smc);
bh_unlock_sock(&smc->sk); bh_unlock_sock(&smc->sk);
} }
@ -107,6 +109,10 @@ int smc_cdc_msg_send(struct smc_connection *conn,
conn->tx_cdc_seq++; conn->tx_cdc_seq++;
conn->local_tx_ctrl.seqno = conn->tx_cdc_seq; conn->local_tx_ctrl.seqno = conn->tx_cdc_seq;
smc_host_msg_to_cdc((struct smc_cdc_msg *)wr_buf, conn, &cfed); smc_host_msg_to_cdc((struct smc_cdc_msg *)wr_buf, conn, &cfed);
atomic_inc(&conn->cdc_pend_tx_wr);
smp_mb__after_atomic(); /* Make sure cdc_pend_tx_wr added before post */
rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend); rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend);
if (!rc) { if (!rc) {
smc_curs_copy(&conn->rx_curs_confirmed, &cfed, conn); smc_curs_copy(&conn->rx_curs_confirmed, &cfed, conn);
@ -114,6 +120,7 @@ int smc_cdc_msg_send(struct smc_connection *conn,
} else { } else {
conn->tx_cdc_seq--; conn->tx_cdc_seq--;
conn->local_tx_ctrl.seqno = conn->tx_cdc_seq; conn->local_tx_ctrl.seqno = conn->tx_cdc_seq;
atomic_dec(&conn->cdc_pend_tx_wr);
} }
return rc; return rc;
@ -136,7 +143,18 @@ int smcr_cdc_msg_send_validation(struct smc_connection *conn,
peer->token = htonl(local->token); peer->token = htonl(local->token);
peer->prod_flags.failover_validation = 1; peer->prod_flags.failover_validation = 1;
/* We need to set pend->conn here to make sure smc_cdc_tx_handler()
* can handle properly
*/
smc_cdc_add_pending_send(conn, pend);
atomic_inc(&conn->cdc_pend_tx_wr);
smp_mb__after_atomic(); /* Make sure cdc_pend_tx_wr added before post */
rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend); rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend);
if (unlikely(rc))
atomic_dec(&conn->cdc_pend_tx_wr);
return rc; return rc;
} }
@ -193,31 +211,9 @@ int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn)
return rc; return rc;
} }
static bool smc_cdc_tx_filter(struct smc_wr_tx_pend_priv *tx_pend, void smc_cdc_wait_pend_tx_wr(struct smc_connection *conn)
unsigned long data)
{ {
struct smc_connection *conn = (struct smc_connection *)data; wait_event(conn->cdc_pend_tx_wq, !atomic_read(&conn->cdc_pend_tx_wr));
struct smc_cdc_tx_pend *cdc_pend =
(struct smc_cdc_tx_pend *)tx_pend;
return cdc_pend->conn == conn;
}
static void smc_cdc_tx_dismisser(struct smc_wr_tx_pend_priv *tx_pend)
{
struct smc_cdc_tx_pend *cdc_pend =
(struct smc_cdc_tx_pend *)tx_pend;
cdc_pend->conn = NULL;
}
void smc_cdc_tx_dismiss_slots(struct smc_connection *conn)
{
struct smc_link *link = conn->lnk;
smc_wr_tx_dismiss_slots(link, SMC_CDC_MSG_TYPE,
smc_cdc_tx_filter, smc_cdc_tx_dismisser,
(unsigned long)conn);
} }
/* Send a SMC-D CDC header. /* Send a SMC-D CDC header.

View File

@ -291,7 +291,7 @@ int smc_cdc_get_free_slot(struct smc_connection *conn,
struct smc_wr_buf **wr_buf, struct smc_wr_buf **wr_buf,
struct smc_rdma_wr **wr_rdma_buf, struct smc_rdma_wr **wr_rdma_buf,
struct smc_cdc_tx_pend **pend); struct smc_cdc_tx_pend **pend);
void smc_cdc_tx_dismiss_slots(struct smc_connection *conn); void smc_cdc_wait_pend_tx_wr(struct smc_connection *conn);
int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf, int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf,
struct smc_cdc_tx_pend *pend); struct smc_cdc_tx_pend *pend);
int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn); int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn);

View File

@ -1127,7 +1127,7 @@ void smc_conn_free(struct smc_connection *conn)
smc_ism_unset_conn(conn); smc_ism_unset_conn(conn);
tasklet_kill(&conn->rx_tsklet); tasklet_kill(&conn->rx_tsklet);
} else { } else {
smc_cdc_tx_dismiss_slots(conn); smc_cdc_wait_pend_tx_wr(conn);
if (current_work() != &conn->abort_work) if (current_work() != &conn->abort_work)
cancel_work_sync(&conn->abort_work); cancel_work_sync(&conn->abort_work);
} }
@ -1204,7 +1204,7 @@ void smcr_link_clear(struct smc_link *lnk, bool log)
smc_llc_link_clear(lnk, log); smc_llc_link_clear(lnk, log);
smcr_buf_unmap_lgr(lnk); smcr_buf_unmap_lgr(lnk);
smcr_rtoken_clear_link(lnk); smcr_rtoken_clear_link(lnk);
smc_ib_modify_qp_reset(lnk); smc_ib_modify_qp_error(lnk);
smc_wr_free_link(lnk); smc_wr_free_link(lnk);
smc_ib_destroy_queue_pair(lnk); smc_ib_destroy_queue_pair(lnk);
smc_ib_dealloc_protection_domain(lnk); smc_ib_dealloc_protection_domain(lnk);
@ -1336,7 +1336,7 @@ static void smc_conn_kill(struct smc_connection *conn, bool soft)
else else
tasklet_unlock_wait(&conn->rx_tsklet); tasklet_unlock_wait(&conn->rx_tsklet);
} else { } else {
smc_cdc_tx_dismiss_slots(conn); smc_cdc_wait_pend_tx_wr(conn);
} }
smc_lgr_unregister_conn(conn); smc_lgr_unregister_conn(conn);
smc_close_active_abort(smc); smc_close_active_abort(smc);
@ -1459,11 +1459,16 @@ void smc_smcd_terminate_all(struct smcd_dev *smcd)
/* Called when an SMCR device is removed or the smc module is unloaded. /* Called when an SMCR device is removed or the smc module is unloaded.
* If smcibdev is given, all SMCR link groups using this device are terminated. * If smcibdev is given, all SMCR link groups using this device are terminated.
* If smcibdev is NULL, all SMCR link groups are terminated. * If smcibdev is NULL, all SMCR link groups are terminated.
*
* We must wait here for QPs been destroyed before we destroy the CQs,
* or we won't received any CQEs and cdc_pend_tx_wr cannot reach 0 thus
* smc_sock cannot be released.
*/ */
void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) void smc_smcr_terminate_all(struct smc_ib_device *smcibdev)
{ {
struct smc_link_group *lgr, *lg; struct smc_link_group *lgr, *lg;
LIST_HEAD(lgr_free_list); LIST_HEAD(lgr_free_list);
LIST_HEAD(lgr_linkdown_list);
int i; int i;
spin_lock_bh(&smc_lgr_list.lock); spin_lock_bh(&smc_lgr_list.lock);
@ -1475,7 +1480,7 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev)
list_for_each_entry_safe(lgr, lg, &smc_lgr_list.list, list) { list_for_each_entry_safe(lgr, lg, &smc_lgr_list.list, list) {
for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
if (lgr->lnk[i].smcibdev == smcibdev) if (lgr->lnk[i].smcibdev == smcibdev)
smcr_link_down_cond_sched(&lgr->lnk[i]); list_move_tail(&lgr->list, &lgr_linkdown_list);
} }
} }
} }
@ -1487,6 +1492,16 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev)
__smc_lgr_terminate(lgr, false); __smc_lgr_terminate(lgr, false);
} }
list_for_each_entry_safe(lgr, lg, &lgr_linkdown_list, list) {
for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
if (lgr->lnk[i].smcibdev == smcibdev) {
mutex_lock(&lgr->llc_conf_mutex);
smcr_link_down_cond(&lgr->lnk[i]);
mutex_unlock(&lgr->llc_conf_mutex);
}
}
}
if (smcibdev) { if (smcibdev) {
if (atomic_read(&smcibdev->lnk_cnt)) if (atomic_read(&smcibdev->lnk_cnt))
wait_event(smcibdev->lnks_deleted, wait_event(smcibdev->lnks_deleted,
@ -1586,7 +1601,6 @@ static void smcr_link_down(struct smc_link *lnk)
if (!lgr || lnk->state == SMC_LNK_UNUSED || list_empty(&lgr->list)) if (!lgr || lnk->state == SMC_LNK_UNUSED || list_empty(&lgr->list))
return; return;
smc_ib_modify_qp_reset(lnk);
to_lnk = smc_switch_conns(lgr, lnk, true); to_lnk = smc_switch_conns(lgr, lnk, true);
if (!to_lnk) { /* no backup link available */ if (!to_lnk) { /* no backup link available */
smcr_link_clear(lnk, true); smcr_link_clear(lnk, true);
@ -1824,6 +1838,7 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini)
conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE; conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE;
conn->local_tx_ctrl.len = SMC_WR_TX_SIZE; conn->local_tx_ctrl.len = SMC_WR_TX_SIZE;
conn->urg_state = SMC_URG_READ; conn->urg_state = SMC_URG_READ;
init_waitqueue_head(&conn->cdc_pend_tx_wq);
INIT_WORK(&smc->conn.abort_work, smc_conn_abort_work); INIT_WORK(&smc->conn.abort_work, smc_conn_abort_work);
if (ini->is_smcd) { if (ini->is_smcd) {
conn->rx_off = sizeof(struct smcd_cdc_msg); conn->rx_off = sizeof(struct smcd_cdc_msg);

View File

@ -109,12 +109,12 @@ int smc_ib_modify_qp_rts(struct smc_link *lnk)
IB_QP_MAX_QP_RD_ATOMIC); IB_QP_MAX_QP_RD_ATOMIC);
} }
int smc_ib_modify_qp_reset(struct smc_link *lnk) int smc_ib_modify_qp_error(struct smc_link *lnk)
{ {
struct ib_qp_attr qp_attr; struct ib_qp_attr qp_attr;
memset(&qp_attr, 0, sizeof(qp_attr)); memset(&qp_attr, 0, sizeof(qp_attr));
qp_attr.qp_state = IB_QPS_RESET; qp_attr.qp_state = IB_QPS_ERR;
return ib_modify_qp(lnk->roce_qp, &qp_attr, IB_QP_STATE); return ib_modify_qp(lnk->roce_qp, &qp_attr, IB_QP_STATE);
} }

View File

@ -90,6 +90,7 @@ int smc_ib_create_queue_pair(struct smc_link *lnk);
int smc_ib_ready_link(struct smc_link *lnk); int smc_ib_ready_link(struct smc_link *lnk);
int smc_ib_modify_qp_rts(struct smc_link *lnk); int smc_ib_modify_qp_rts(struct smc_link *lnk);
int smc_ib_modify_qp_reset(struct smc_link *lnk); int smc_ib_modify_qp_reset(struct smc_link *lnk);
int smc_ib_modify_qp_error(struct smc_link *lnk);
long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev); long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev);
int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags, int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags,
struct smc_buf_desc *buf_slot, u8 link_idx); struct smc_buf_desc *buf_slot, u8 link_idx);

View File

@ -62,13 +62,9 @@ static inline bool smc_wr_is_tx_pend(struct smc_link *link)
} }
/* wait till all pending tx work requests on the given link are completed */ /* wait till all pending tx work requests on the given link are completed */
int smc_wr_tx_wait_no_pending_sends(struct smc_link *link) void smc_wr_tx_wait_no_pending_sends(struct smc_link *link)
{ {
if (wait_event_timeout(link->wr_tx_wait, !smc_wr_is_tx_pend(link), wait_event(link->wr_tx_wait, !smc_wr_is_tx_pend(link));
SMC_WR_TX_WAIT_PENDING_TIME))
return 0;
else /* timeout */
return -EPIPE;
} }
static inline int smc_wr_tx_find_pending_index(struct smc_link *link, u64 wr_id) static inline int smc_wr_tx_find_pending_index(struct smc_link *link, u64 wr_id)
@ -87,7 +83,6 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
struct smc_wr_tx_pend pnd_snd; struct smc_wr_tx_pend pnd_snd;
struct smc_link *link; struct smc_link *link;
u32 pnd_snd_idx; u32 pnd_snd_idx;
int i;
link = wc->qp->qp_context; link = wc->qp->qp_context;
@ -128,14 +123,6 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
} }
if (wc->status) { if (wc->status) {
for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) {
/* clear full struct smc_wr_tx_pend including .priv */
memset(&link->wr_tx_pends[i], 0,
sizeof(link->wr_tx_pends[i]));
memset(&link->wr_tx_bufs[i], 0,
sizeof(link->wr_tx_bufs[i]));
clear_bit(i, link->wr_tx_mask);
}
if (link->lgr->smc_version == SMC_V2) { if (link->lgr->smc_version == SMC_V2) {
memset(link->wr_tx_v2_pend, 0, memset(link->wr_tx_v2_pend, 0,
sizeof(*link->wr_tx_v2_pend)); sizeof(*link->wr_tx_v2_pend));
@ -421,25 +408,6 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr)
return rc; return rc;
} }
void smc_wr_tx_dismiss_slots(struct smc_link *link, u8 wr_tx_hdr_type,
smc_wr_tx_filter filter,
smc_wr_tx_dismisser dismisser,
unsigned long data)
{
struct smc_wr_tx_pend_priv *tx_pend;
struct smc_wr_rx_hdr *wr_tx;
int i;
for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) {
wr_tx = (struct smc_wr_rx_hdr *)&link->wr_tx_bufs[i];
if (wr_tx->type != wr_tx_hdr_type)
continue;
tx_pend = &link->wr_tx_pends[i].priv;
if (filter(tx_pend, data))
dismisser(tx_pend);
}
}
/****************************** receive queue ********************************/ /****************************** receive queue ********************************/
int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler) int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler)
@ -675,10 +643,7 @@ void smc_wr_free_link(struct smc_link *lnk)
smc_wr_wakeup_reg_wait(lnk); smc_wr_wakeup_reg_wait(lnk);
smc_wr_wakeup_tx_wait(lnk); smc_wr_wakeup_tx_wait(lnk);
if (smc_wr_tx_wait_no_pending_sends(lnk)) smc_wr_tx_wait_no_pending_sends(lnk);
memset(lnk->wr_tx_mask, 0,
BITS_TO_LONGS(SMC_WR_BUF_CNT) *
sizeof(*lnk->wr_tx_mask));
wait_event(lnk->wr_reg_wait, (!atomic_read(&lnk->wr_reg_refcnt))); wait_event(lnk->wr_reg_wait, (!atomic_read(&lnk->wr_reg_refcnt)));
wait_event(lnk->wr_tx_wait, (!atomic_read(&lnk->wr_tx_refcnt))); wait_event(lnk->wr_tx_wait, (!atomic_read(&lnk->wr_tx_refcnt)));

View File

@ -22,7 +22,6 @@
#define SMC_WR_BUF_CNT 16 /* # of ctrl buffers per link */ #define SMC_WR_BUF_CNT 16 /* # of ctrl buffers per link */
#define SMC_WR_TX_WAIT_FREE_SLOT_TIME (10 * HZ) #define SMC_WR_TX_WAIT_FREE_SLOT_TIME (10 * HZ)
#define SMC_WR_TX_WAIT_PENDING_TIME (5 * HZ)
#define SMC_WR_TX_SIZE 44 /* actual size of wr_send data (<=SMC_WR_BUF_SIZE) */ #define SMC_WR_TX_SIZE 44 /* actual size of wr_send data (<=SMC_WR_BUF_SIZE) */
@ -130,7 +129,7 @@ void smc_wr_tx_dismiss_slots(struct smc_link *lnk, u8 wr_rx_hdr_type,
smc_wr_tx_filter filter, smc_wr_tx_filter filter,
smc_wr_tx_dismisser dismisser, smc_wr_tx_dismisser dismisser,
unsigned long data); unsigned long data);
int smc_wr_tx_wait_no_pending_sends(struct smc_link *link); void smc_wr_tx_wait_no_pending_sends(struct smc_link *link);
int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler); int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler);
int smc_wr_rx_post_init(struct smc_link *link); int smc_wr_rx_post_init(struct smc_link *link);