Drivers: hv: vmbus: Offload the handling of channels to two workqueues
vmbus_process_offer() mustn't call channel->sc_creation_callback() directly for sub-channels, because sc_creation_callback() -> vmbus_open() may never get the host's response to the OPEN_CHANNEL message (the host may rescind a channel at any time, e.g. in the case of hot removing a NIC), and vmbus_onoffer_rescind() may not wake up vmbus_open() as it's blocked due to a non-zero vmbus_connection.offer_in_progress, and finally we have a deadlock.

The above is also true for primary channels, if the related device drivers use sync probing mode by default.

And, usually the handling of primary channels and sub-channels can depend on each other, so we should offload them to different workqueues to avoid a possible deadlock, e.g. in sync-probing mode, NIC1's netvsc_subchan_work() can race with NIC2's netvsc_probe() -> rtnl_lock() and cause a deadlock: the former gets the rtnl_lock and waits for all the sub-channels to appear, but the latter can't get the rtnl_lock, and this blocks the handling of sub-channels.

The patch can fix the multiple-NIC deadlock described above for v3.x kernels (e.g. RHEL 7.x), which don't support async-probing of devices, and for v4.4, v4.9, v4.14 and v4.18, which support async-probing but don't enable it for Hyper-V drivers (yet).

The patch can also fix the hang in sub-channel handling described above for all versions of kernels, including v4.19 and v4.20-rc4. So the patch should actually be applied to all existing kernels, not only the kernels that have 8195b1396e.

Fixes: 8195b1396e ("hv_netvsc: fix deadlock on hotplug")
Cc: stable@vger.kernel.org
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: K. Y. Srinivasan <kys@microsoft.com>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Signed-off-by: Dexuan Cui <decui@microsoft.com>
Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
commit 37c2578c0c (parent 2595646791)
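The fix relies on the standard kernel workqueue pattern: give each class of channel work its own dedicated workqueue, so a blocked primary-channel probe can never stall sub-channel handling. Below is a minimal out-of-tree module sketch of that pattern -- it is not the vmbus code itself; the module, queue and function names are invented for illustration, and only the workqueue primitives (create_workqueue(), INIT_WORK(), queue_work(), destroy_workqueue()) are the same ones the patch uses.

    /* Sketch only: two classes of work on two dedicated workqueues. */
    #include <linux/module.h>
    #include <linux/workqueue.h>

    static struct workqueue_struct *pri_wq;   /* cf. hv_pri_chan */
    static struct workqueue_struct *sub_wq;   /* cf. hv_sub_chan */

    static struct work_struct pri_work;
    static struct work_struct sub_work;

    static void pri_work_fn(struct work_struct *work)
    {
            /* e.g. probe a primary channel; may sleep for a long time */
            pr_info("primary-channel work ran\n");
    }

    static void sub_work_fn(struct work_struct *work)
    {
            /* e.g. open a sub-channel; runs even while pri_work_fn() is blocked */
            pr_info("sub-channel work ran\n");
    }

    static int __init two_wq_init(void)
    {
            pri_wq = create_workqueue("demo_pri_chan");
            if (!pri_wq)
                    return -ENOMEM;

            sub_wq = create_workqueue("demo_sub_chan");
            if (!sub_wq) {
                    destroy_workqueue(pri_wq);
                    return -ENOMEM;
            }

            INIT_WORK(&pri_work, pri_work_fn);
            INIT_WORK(&sub_work, sub_work_fn);

            /* Queue each class of work on its own workqueue. */
            queue_work(pri_wq, &pri_work);
            queue_work(sub_wq, &sub_work);
            return 0;
    }

    static void __exit two_wq_exit(void)
    {
            /* destroy_workqueue() flushes pending work before tearing down. */
            destroy_workqueue(sub_wq);
            destroy_workqueue(pri_wq);
    }

    module_init(two_wq_init);
    module_exit(two_wq_exit);
    MODULE_LICENSE("GPL");

In the patch itself the work_struct lives inside struct vmbus_channel, and vmbus_process_offer() picks hv_pri_chan or hv_sub_chan depending on whether the offer is for a new primary channel.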
@@ -435,61 +435,16 @@ void vmbus_free_channels(void)
        }
}

/*
 * vmbus_process_offer - Process the offer by creating a channel/device
 * associated with this offer
 */
static void vmbus_process_offer(struct vmbus_channel *newchannel)
/* Note: the function can run concurrently for primary/sub channels. */
static void vmbus_add_channel_work(struct work_struct *work)
{
        struct vmbus_channel *channel;
        bool fnew = true;
        struct vmbus_channel *newchannel =
                container_of(work, struct vmbus_channel, add_channel_work);
        struct vmbus_channel *primary_channel = newchannel->primary_channel;
        unsigned long flags;
        u16 dev_type;
        int ret;

        /* Make sure this is a new offer */
        mutex_lock(&vmbus_connection.channel_mutex);

        /*
         * Now that we have acquired the channel_mutex,
         * we can release the potentially racing rescind thread.
         */
        atomic_dec(&vmbus_connection.offer_in_progress);

        list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
                if (!uuid_le_cmp(channel->offermsg.offer.if_type,
                        newchannel->offermsg.offer.if_type) &&
                        !uuid_le_cmp(channel->offermsg.offer.if_instance,
                                newchannel->offermsg.offer.if_instance)) {
                        fnew = false;
                        break;
                }
        }

        if (fnew)
                list_add_tail(&newchannel->listentry,
                              &vmbus_connection.chn_list);

        mutex_unlock(&vmbus_connection.channel_mutex);

        if (!fnew) {
                /*
                 * Check to see if this is a sub-channel.
                 */
                if (newchannel->offermsg.offer.sub_channel_index != 0) {
                        /*
                         * Process the sub-channel.
                         */
                        newchannel->primary_channel = channel;
                        spin_lock_irqsave(&channel->lock, flags);
                        list_add_tail(&newchannel->sc_list, &channel->sc_list);
                        channel->num_sc++;
                        spin_unlock_irqrestore(&channel->lock, flags);
                } else {
                        goto err_free_chan;
                }
        }

        dev_type = hv_get_dev_type(newchannel);

        init_vp_index(newchannel, dev_type);

@@ -507,27 +462,26 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
        /*
         * This state is used to indicate a successful open
         * so that when we do close the channel normally, we
         * can cleanup properly
         * can cleanup properly.
         */
        newchannel->state = CHANNEL_OPEN_STATE;

        if (!fnew) {
                struct hv_device *dev
                        = newchannel->primary_channel->device_obj;
        if (primary_channel != NULL) {
                /* newchannel is a sub-channel. */
                struct hv_device *dev = primary_channel->device_obj;

                if (vmbus_add_channel_kobj(dev, newchannel))
                        goto err_free_chan;
                        goto err_deq_chan;

                if (primary_channel->sc_creation_callback != NULL)
                        primary_channel->sc_creation_callback(newchannel);

                if (channel->sc_creation_callback != NULL)
                        channel->sc_creation_callback(newchannel);
                newchannel->probe_done = true;
                return;
        }

        /*
         * Start the process of binding this offer to the driver
         * We need to set the DeviceObject field before calling
         * vmbus_child_dev_add()
         * Start the process of binding the primary channel to the driver
         */
        newchannel->device_obj = vmbus_device_create(
                &newchannel->offermsg.offer.if_type,

@@ -556,13 +510,28 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)

err_deq_chan:
        mutex_lock(&vmbus_connection.channel_mutex);
        list_del(&newchannel->listentry);

        /*
         * We need to set the flag, otherwise
         * vmbus_onoffer_rescind() can be blocked.
         */
        newchannel->probe_done = true;

        if (primary_channel == NULL) {
                list_del(&newchannel->listentry);
        } else {
                spin_lock_irqsave(&primary_channel->lock, flags);
                list_del(&newchannel->sc_list);
                spin_unlock_irqrestore(&primary_channel->lock, flags);
        }

        mutex_unlock(&vmbus_connection.channel_mutex);

        if (newchannel->target_cpu != get_cpu()) {
                put_cpu();
                smp_call_function_single(newchannel->target_cpu,
                                         percpu_channel_deq, newchannel, true);
                                         percpu_channel_deq,
                                         newchannel, true);
        } else {
                percpu_channel_deq(newchannel);
                put_cpu();

@@ -570,14 +539,104 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)

        vmbus_release_relid(newchannel->offermsg.child_relid);

err_free_chan:
        free_channel(newchannel);
}

/*
 * vmbus_process_offer - Process the offer by creating a channel/device
 * associated with this offer
 */
static void vmbus_process_offer(struct vmbus_channel *newchannel)
{
        struct vmbus_channel *channel;
        struct workqueue_struct *wq;
        unsigned long flags;
        bool fnew = true;

        mutex_lock(&vmbus_connection.channel_mutex);

        /*
         * Now that we have acquired the channel_mutex,
         * we can release the potentially racing rescind thread.
         */
        atomic_dec(&vmbus_connection.offer_in_progress);

        list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
                if (!uuid_le_cmp(channel->offermsg.offer.if_type,
                        newchannel->offermsg.offer.if_type) &&
                        !uuid_le_cmp(channel->offermsg.offer.if_instance,
                                newchannel->offermsg.offer.if_instance)) {
                        fnew = false;
                        break;
                }
        }

        if (fnew)
                list_add_tail(&newchannel->listentry,
                              &vmbus_connection.chn_list);
        else {
                /*
                 * Check to see if this is a valid sub-channel.
                 */
                if (newchannel->offermsg.offer.sub_channel_index == 0) {
                        mutex_unlock(&vmbus_connection.channel_mutex);
                        /*
                         * Don't call free_channel(), because newchannel->kobj
                         * is not initialized yet.
                         */
                        kfree(newchannel);
                        WARN_ON_ONCE(1);
                        return;
                }
                /*
                 * Process the sub-channel.
                 */
                newchannel->primary_channel = channel;
                spin_lock_irqsave(&channel->lock, flags);
                list_add_tail(&newchannel->sc_list, &channel->sc_list);
                spin_unlock_irqrestore(&channel->lock, flags);
        }

        mutex_unlock(&vmbus_connection.channel_mutex);

        /*
         * vmbus_process_offer() mustn't call channel->sc_creation_callback()
         * directly for sub-channels, because sc_creation_callback() ->
         * vmbus_open() may never get the host's response to the
         * OPEN_CHANNEL message (the host may rescind a channel at any time,
         * e.g. in the case of hot removing a NIC), and vmbus_onoffer_rescind()
         * may not wake up the vmbus_open() as it's blocked due to a non-zero
         * vmbus_connection.offer_in_progress, and finally we have a deadlock.
         *
         * The above is also true for primary channels, if the related device
         * drivers use sync probing mode by default.
         *
         * And, usually the handling of primary channels and sub-channels can
         * depend on each other, so we should offload them to different
         * workqueues to avoid possible deadlock, e.g. in sync-probing mode,
         * NIC1's netvsc_subchan_work() can race with NIC2's netvsc_probe() ->
         * rtnl_lock(), and causes deadlock: the former gets the rtnl_lock
         * and waits for all the sub-channels to appear, but the latter
         * can't get the rtnl_lock and this blocks the handling of
         * sub-channels.
         */
        INIT_WORK(&newchannel->add_channel_work, vmbus_add_channel_work);
        wq = fnew ? vmbus_connection.handle_primary_chan_wq :
                    vmbus_connection.handle_sub_chan_wq;
        queue_work(wq, &newchannel->add_channel_work);
}

/*
 * We use this state to statically distribute the channel interrupt load.
 */
static int next_numa_node_id;
/*
 * init_vp_index() accesses global variables like next_numa_node_id, and
 * it can run concurrently for primary channels and sub-channels: see
 * vmbus_process_offer(), so we need the lock to protect the global
 * variables.
 */
static DEFINE_SPINLOCK(bind_channel_to_cpu_lock);

/*
 * Starting with Win8, we can statically distribute the incoming

@@ -613,6 +672,8 @@ static void init_vp_index(struct vmbus_channel *channel, u16 dev_type)
                return;
        }

        spin_lock(&bind_channel_to_cpu_lock);

        /*
         * Based on the channel affinity policy, we will assign the NUMA
         * nodes.

@@ -695,6 +756,8 @@ static void init_vp_index(struct vmbus_channel *channel, u16 dev_type)
        channel->target_cpu = cur_cpu;
        channel->target_vp = hv_cpu_number_to_vp_number(cur_cpu);

        spin_unlock(&bind_channel_to_cpu_lock);

        free_cpumask_var(available_mask);
}

@@ -190,6 +190,20 @@ int vmbus_connect(void)
                goto cleanup;
        }

        vmbus_connection.handle_primary_chan_wq =
                create_workqueue("hv_pri_chan");
        if (!vmbus_connection.handle_primary_chan_wq) {
                ret = -ENOMEM;
                goto cleanup;
        }

        vmbus_connection.handle_sub_chan_wq =
                create_workqueue("hv_sub_chan");
        if (!vmbus_connection.handle_sub_chan_wq) {
                ret = -ENOMEM;
                goto cleanup;
        }

        INIT_LIST_HEAD(&vmbus_connection.chn_msg_list);
        spin_lock_init(&vmbus_connection.channelmsg_lock);

@@ -280,10 +294,14 @@ void vmbus_disconnect(void)
         */
        vmbus_initiate_unload(false);

        if (vmbus_connection.work_queue) {
                drain_workqueue(vmbus_connection.work_queue);
        if (vmbus_connection.handle_sub_chan_wq)
                destroy_workqueue(vmbus_connection.handle_sub_chan_wq);

        if (vmbus_connection.handle_primary_chan_wq)
                destroy_workqueue(vmbus_connection.handle_primary_chan_wq);

        if (vmbus_connection.work_queue)
                destroy_workqueue(vmbus_connection.work_queue);
        }

        if (vmbus_connection.int_page) {
                free_pages((unsigned long)vmbus_connection.int_page, 0);

@@ -335,7 +335,14 @@ struct vmbus_connection {
        struct list_head chn_list;
        struct mutex channel_mutex;

        /*
         * An offer message is handled first on the work_queue, and then
         * is further handled on handle_primary_chan_wq or
         * handle_sub_chan_wq.
         */
        struct workqueue_struct *work_queue;
        struct workqueue_struct *handle_primary_chan_wq;
        struct workqueue_struct *handle_sub_chan_wq;
};

@@ -905,6 +905,13 @@ struct vmbus_channel {

        bool probe_done;

        /*
         * We must offload the handling of the primary/sub channels
         * from the single-threaded vmbus_connection.work_queue to
         * two different workqueue, otherwise we can block
         * vmbus_connection.work_queue and hang: see vmbus_process_offer().
         */
        struct work_struct add_channel_work;
};

static inline bool is_hvsock_channel(const struct vmbus_channel *c)
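
The last hunk embeds the work item (add_channel_work) directly in struct vmbus_channel, which is what lets vmbus_add_channel_work() recover its channel via container_of(). A small stand-alone user-space sketch of that embed-and-recover pattern follows; the struct and field names here are invented for the example, and only the container_of()/offsetof() arithmetic mirrors what the kernel code does.

    /* Sketch only: recover the enclosing object from an embedded member. */
    #include <stddef.h>
    #include <stdio.h>

    #define container_of(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    struct work_item {
            void (*func)(struct work_item *work);
    };

    struct channel {
            int relid;                  /* some per-channel state */
            struct work_item add_work;  /* embedded work item, like add_channel_work */
    };

    static void add_channel_work_fn(struct work_item *work)
    {
            /* The handler only receives the work pointer; container_of()
             * walks back to the enclosing channel object. */
            struct channel *chan = container_of(work, struct channel, add_work);

            printf("handling channel with relid %d\n", chan->relid);
    }

    int main(void)
    {
            struct channel chan = { .relid = 42 };

            chan.add_work.func = add_channel_work_fn;

            /* A real workqueue would invoke this from a worker thread. */
            chan.add_work.func(&chan.add_work);
            return 0;
    }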